Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,88 +1,51 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
import
|
4 |
-
|
5 |
-
import
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
pixel_values = processor(image, return_tensors="pt").pixel_values
|
22 |
-
|
23 |
-
# Generate text
|
24 |
-
generated_ids = model.generate(pixel_values)
|
25 |
-
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
26 |
-
|
27 |
-
return generated_text
|
28 |
-
except Exception as e:
|
29 |
-
st.error(f"An error occurred during OCR processing: {str(e)}")
|
30 |
-
return None
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
return []
|
35 |
|
36 |
-
#
|
37 |
-
|
38 |
-
matches = list(pattern.finditer(text))
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
results.append(highlighted)
|
49 |
-
|
50 |
-
return results
|
51 |
-
|
52 |
-
st.title("OCR and Text Search Application")
|
53 |
-
|
54 |
-
uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
|
55 |
-
|
56 |
-
if uploaded_file is not None:
|
57 |
-
try:
|
58 |
-
image = Image.open(uploaded_file)
|
59 |
-
st.image(image, caption="Uploaded Image", use_column_width=True)
|
60 |
|
61 |
-
if
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
st.
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
# Search functionality
|
78 |
-
search_keyword = st.text_input("Enter a keyword to search:")
|
79 |
-
if search_keyword and 'extracted_text' in st.session_state:
|
80 |
-
search_results = search_text(st.session_state.extracted_text, search_keyword)
|
81 |
-
if search_results:
|
82 |
-
st.success(f"Found {len(search_results)} matches for '{search_keyword}':")
|
83 |
-
for i, result in enumerate(search_results, 1):
|
84 |
-
st.markdown(f"{i}. ...{result}...")
|
85 |
-
else:
|
86 |
-
st.warning(f"No matches found for '{search_keyword}'.")
|
87 |
-
elif search_keyword:
|
88 |
-
st.info("Please perform OCR on an image before searching.")
|
|
|
1 |
+
# app.py
"""Streamlit app: run Tesseract OCR on an uploaded image and search the text.

The script is executed top to bottom by Streamlit on every interaction.
"""
import html
import re

import streamlit as st
import cv2
import numpy as np
import pytesseract
from PIL import Image

# Number of characters of surrounding text shown on each side of a match.
CONTEXT_CHARS = 50


def highlight_matches(text, keyword, context_chars=CONTEXT_CHARS):
    """Return one HTML snippet per case-insensitive occurrence of *keyword*.

    Each snippet holds up to ``context_chars`` characters of text before and
    after the match, with the match itself wrapped in a yellow ``<span>``.
    The keyword is matched literally (regex metacharacters are escaped).
    All text taken from ``text`` is HTML-escaped, because the snippets are
    meant to be rendered with ``unsafe_allow_html=True``.

    Args:
        text: The text to search (here: the OCR output).
        keyword: The literal string to look for.
        context_chars: Characters of context to keep on each side.

    Returns:
        A list of HTML strings, empty when there is no match.
    """
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    snippets = []
    for match in pattern.finditer(text):
        start, end = match.span()
        before = text[max(0, start - context_chars):start]
        after = text[end:end + context_chars]
        snippets.append(
            f"{html.escape(before)}"
            "<span style='background-color: yellow;'>"
            f"{html.escape(match.group())}</span>"
            f"{html.escape(after)}"
        )
    return snippets


# Set the title of the webpage
st.title("OCR Text Extraction Tool")

# Uploading an image
uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Convert the uploaded file content to an image
    image = Image.open(uploaded_file)

    # Convert PIL Image to OpenCV format. Normalise to RGB first: PNG uploads
    # can be RGBA, palette or greyscale, and COLOR_RGB2BGR needs 3 channels.
    opencv_image = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)

    # Display the image
    st.image(image, caption='Uploaded Image', use_column_width=True)

    # Perform OCR
    text = pytesseract.image_to_string(opencv_image)

    st.subheader("Extracted Text:")
    st.write(text)

    # Search functionality (kept under the upload guard: `text` only exists here)
    search_keyword = st.text_input("Enter a keyword to search in the extracted text:")
    if search_keyword:
        snippets = highlight_matches(text, search_keyword)

        if snippets:
            st.markdown("### Keyword Found:")
            for snippet in snippets:
                # The highlight is raw HTML; Streamlit escapes it unless
                # unsafe_allow_html is set. The snippet text is pre-escaped.
                st.markdown(f"...{snippet}...", unsafe_allow_html=True)
        else:
            st.write(f"Keyword '{search_keyword}' not found in the extracted text.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|