ayush2607 committed
Commit 52f121a · verified · 1 Parent(s): 99db298

Create app.py

Files changed (1): app.py (+86, -0)
app.py ADDED
@@ -0,0 +1,86 @@
import re

import streamlit as st
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load model on CPU
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float32, device_map=None
).to("cpu")  # Ensure the model is on CPU

# Bound the number of visual tokens so large images stay within CPU memory
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
)

# Streamlit app
st.title("OCR Application with Keyword Search")

# Upload image
uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Convert the uploaded file to a PIL image
    img = Image.open(uploaded_file)

    # Display the uploaded image
    st.image(img, caption="Uploaded Image", use_column_width=True)

    # Prepare the chat-style request for the model
    messages = [
        {
            "role": "system",
            "content": "You are an AI assistant specialized in reading and extracting text from images. Your task is to report the actual words and characters visible in the image, exactly as they appear, maintaining the original language (Hindi or English).",
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": img,  # Pass the PIL image object directly
                },
                {
                    "type": "text",
                    "text": "Read and extract ALL text visible in this image. Provide ONLY the actual words, numbers, and characters you see, exactly as they appear.",
                },
            ],
        },
    ]

    # Build the text prompt and collect the vision inputs for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cpu")  # Keep the inputs on CPU

    # Inference on CPU
    generated_ids = model.generate(**inputs, max_new_tokens=200)
    # Strip the prompt tokens so only the newly generated text is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Display the extracted text
    extracted_text = output_text[0]
    st.subheader("Extracted Text")
    st.write(extracted_text)

    # Keyword search (case-insensitive, to match the containment check below)
    keyword = st.text_input("Enter keyword to search in the extracted text")
    if keyword:
        if keyword.lower() in extracted_text.lower():
            # Bold every occurrence regardless of case; re.escape guards regex metacharacters
            highlighted_text = re.sub(
                re.escape(keyword),
                lambda m: f"**{m.group(0)}**",
                extracted_text,
                flags=re.IGNORECASE,
            )
            st.subheader("Keyword Found")
            st.markdown(highlighted_text)
        else:
            st.write("Keyword not found in the extracted text.")
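To try the commit locally, the usual Streamlit entry point applies: install the dependencies (streamlit, torch, transformers, qwen-vl-utils, Pillow) and launch the app with "streamlit run app.py".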
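A note on performance, not part of the commit itself: Streamlit re-executes the whole script on every widget interaction, so the module-level load above re-reads the 2B-parameter checkpoint each time the user types a keyword. Assuming a reasonably recent Streamlit release (st.cache_resource landed in 1.18), a minimal sketch of cached loading looks like the following; load_model_and_processor is a hypothetical helper name, not from the commit.

import streamlit as st
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

@st.cache_resource  # keep one copy of the heavyweight objects across Streamlit reruns
def load_model_and_processor():
    # Same arguments as in app.py above; only the caching wrapper is new
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float32
    ).to("cpu")
    processor = AutoProcessor.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct",
        min_pixels=256 * 28 * 28,
        max_pixels=1280 * 28 * 28,
    )
    return model, processor

model, processor = load_model_and_processor()

With this wrapper, subsequent reruns reuse the cached objects instead of reloading the model, which is the main cost on a CPU-only host.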