Create app.py
app.py
ADDED
@@ -0,0 +1,86 @@
import streamlit as st
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import re  # used for case-insensitive keyword highlighting
from PIL import Image

# Load model on CPU
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float32, device_map=None
).to("cpu")  # Ensure the model is on CPU

min_pixels = 256*28*28
max_pixels = 1280*28*28
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# Streamlit app
st.title("OCR Application with Keyword Search")

# Upload image
uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Convert the uploaded file to an image
    img = Image.open(uploaded_file)

    # Display the uploaded image
    st.image(img, caption="Uploaded Image", use_column_width=True)

    # Prepare the image for the model
    messages = [
        {
            "role": "system",
            "content": "You are an AI assistant specialized in reading and extracting text from images. Your task is to report the actual words and characters visible in the image, exactly as they appear, maintaining the original language (Hindi or English).",
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": img,  # Pass the image object directly
                },
                {
                    "type": "text",
                    "text": "Read and extract ALL text visible in this image. Provide ONLY the actual words, numbers, and characters you see, exactly as they appear.",
                },
            ],
        },
    ]

    # Process the image for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cpu")  # Send the inputs to CPU

    # Inference on CPU
    generated_ids = model.generate(**inputs, max_new_tokens=200)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Display the extracted text
    extracted_text = output_text[0]
    st.subheader("Extracted Text")
    st.write(extracted_text)

    # Keyword Search
    keyword = st.text_input("Enter keyword to search in the extracted text")
    if keyword:
        if keyword.lower() in extracted_text.lower():
            # Highlight every occurrence case-insensitively, matching the check above
            highlighted_text = re.sub(
                re.escape(keyword),
                lambda m: f"**{m.group(0)}**",
                extracted_text,
                flags=re.IGNORECASE,
            )
            st.subheader("Keyword Found")
            st.write(highlighted_text, unsafe_allow_html=True)
        else:
            st.write("Keyword not found in the extracted text.")
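
A practical refinement not included in the committed file: Streamlit re-executes the whole script on every widget interaction (for example, each time the keyword box is submitted), so the model and processor above are reloaded and inference re-runs each time. A minimal sketch of caching the loading step, assuming a Streamlit version that provides st.cache_resource, might look like this:

import streamlit as st
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor


@st.cache_resource  # load once per server process and reuse across reruns
def load_model_and_processor():
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float32, device_map=None
    ).to("cpu")
    processor = AutoProcessor.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28
    )
    return model, processor


model, processor = load_model_and_processor()

Assuming the standard PyPI packages behind these imports (streamlit, torch, transformers, qwen-vl-utils, Pillow) are installed, the app can be started locally with "streamlit run app.py".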