PRIYANSHUDHAKED committed on
Commit
ec4e14b
·
verified ·
1 Parent(s): 7efa875

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -83
app.py CHANGED
@@ -1,88 +1,51 @@
1
- import streamlit as st
2
- from PIL import Image
3
- import torch
4
- from transformers import AutoProcessor, AutoModelForVision2Seq
5
- import re
6
-
7
# Hugging Face model id of the OCR checkpoint (large variant chosen for better accuracy).
model_name = "microsoft/trocr-large-handwritten"


@st.cache_resource
def load_model():
    """Load the OCR processor and model, cached across Streamlit reruns.

    The loading happens *inside* the cached function so that
    ``st.cache_resource`` actually prevents the expensive
    ``from_pretrained`` calls from running again on every script rerun.
    Loading at module top level would bypass the cache entirely, because
    Streamlit re-executes the whole script on each interaction.

    Returns:
        A ``(processor, model)`` tuple for ``model_name``.
    """
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForVision2Seq.from_pretrained(model_name)
    return processor, model
15
-
16
def perform_ocr(image):
    """Run the OCR model on an uploaded image and return the decoded text.

    Args:
        image: A PIL image (the caller passes the result of ``Image.open``).

    Returns:
        The recognised text as a string, or ``None`` if anything went wrong.
        Failures are reported to the user through ``st.error`` instead of
        being raised, so the UI keeps running.
    """
    processor, model = load_model()

    try:
        # Uploaded PNG/JPEG files may be grayscale, palette-based or carry an
        # alpha channel; normalise to 3-channel RGB before preprocessing.
        rgb_image = image.convert("RGB")
        pixel_values = processor(rgb_image, return_tensors="pt").pixel_values

        # Generate token ids and decode the first (only) sequence of the batch.
        generated_ids = model.generate(pixel_values)
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Deliberate UI boundary: surface the problem to the user, signal failure with None.
        st.error(f"An error occurred during OCR processing: {str(e)}")
        return None
31
 
32
def search_text(text, keyword, context_size=50):
    """Find case-insensitive occurrences of *keyword* in *text* with context.

    The keyword is matched literally (regex metacharacters are escaped).
    For every occurrence, a snippet of up to ``context_size`` characters on
    each side is returned, and every occurrence of the keyword that falls
    inside that snippet is wrapped in ``**`` (Markdown bold).

    Args:
        text: The text to search.
        keyword: The literal keyword to look for.
        context_size: Number of characters of context kept before and after
            each match. Defaults to 50.

    Returns:
        A list with one highlighted snippet per match, in order of
        appearance. Empty if either argument is empty or nothing matches.

    Raises:
        ValueError: If ``context_size`` is negative.
    """
    if context_size < 0:
        raise ValueError("context_size must be non-negative")
    if not keyword or not text:
        return []

    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

    results = []
    for match in pattern.finditer(text):
        # Clamp the context window to the bounds of the text.
        start = max(0, match.start() - context_size)
        end = min(len(text), match.end() + context_size)
        snippet = text[start:end]
        results.append(pattern.sub(lambda m: f"**{m.group()}**", snippet))
    return results
51
-
52
# ---- Page layout: upload an image, run OCR on demand, then search the result ----
st.title("OCR and Text Search Application")

uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    try:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)

        # OCR only runs when the user explicitly asks for it.
        if st.button("Perform OCR"):
            with st.spinner("Processing image..."):
                extracted_text = perform_ocr(image)

            if not extracted_text:
                st.error("Failed to extract text from the image. Please try another image.")
            else:
                st.success("OCR completed successfully!")
                st.subheader("Extracted Text:")
                st.write(extracted_text)

                # Keep the text in session state so the search box can use it on later reruns.
                st.session_state.extracted_text = extracted_text
    except Exception as e:
        st.error(f"An error occurred while processing the image: {str(e)}")

# ---- Keyword search over the most recently extracted text ----
search_keyword = st.text_input("Enter a keyword to search:")
if search_keyword:
    if 'extracted_text' not in st.session_state:
        st.info("Please perform OCR on an image before searching.")
    else:
        search_results = search_text(st.session_state.extracted_text, search_keyword)
        if not search_results:
            st.warning(f"No matches found for '{search_keyword}'.")
        else:
            st.success(f"Found {len(search_results)} matches for '{search_keyword}':")
            for position, snippet in enumerate(search_results, 1):
                st.markdown(f"{position}. ...{snippet}...")
 
1
+ # app.py
2
+ import streamlit as st
3
+ import cv2
4
+ import numpy as np
5
+ import pytesseract
6
+ from PIL import Image
7
+ import re
8
+
9
import html  # Used below to escape OCR output before it is rendered as HTML.

# Set the title of the webpage
st.title("OCR Text Extraction Tool")

# Uploading an image
uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Convert the uploaded file content to an image
    image = Image.open(uploaded_file)

    # Convert PIL Image to OpenCV format. The image is normalised to RGB
    # first: grayscale or palette uploads would otherwise yield a 2-D array
    # that cv2.cvtColor(..., COLOR_RGB2BGR) rejects.
    opencv_image = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)

    # Display the image
    st.image(image, caption='Uploaded Image', use_column_width=True)

    # Perform OCR
    text = pytesseract.image_to_string(opencv_image)

    st.subheader("Extracted Text:")
    st.write(text)

    # Search functionality. It lives inside the upload branch because `text`
    # only exists once an image has been processed.
    search_keyword = st.text_input("Enter a keyword to search in the extracted text:")
    if search_keyword:
        # Literal, case-insensitive match of the keyword.
        pattern = re.compile(re.escape(search_keyword), re.IGNORECASE)
        matches = list(pattern.finditer(text))

        if matches:
            st.markdown("### Keyword Found:")
            for match in matches:
                start, end = match.span()
                # Show up to 50 characters of context on each side of the match.
                context_start = max(0, start - 50)
                context_end = min(len(text), end + 50)
                # OCR output is untrusted text: escape it so that only the
                # highlight <span> below is interpreted as HTML.
                highlighted_text = (
                    html.escape(text[context_start:start])
                    + "<span style='background-color: yellow;'>"
                    + html.escape(text[start:end])
                    + "</span>"
                    + html.escape(text[end:context_end])
                )
                # Without unsafe_allow_html the <span> tag is shown as literal text.
                st.markdown(f"...{highlighted_text}...", unsafe_allow_html=True)
        else:
            st.write(f"Keyword '{search_keyword}' not found in the extracted text.")