Spaces:

Saurabh1207
/

VLM

Sleeping

VLM

File size: 3,032 Bytes

763589d
 
30a00ea
 
c42cef3
763589d
e3dbfa9
f90e854
 
6d15eb6
 
 
763589d
e3dbfa9
eed28b8
e3dbfa9
f90e854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
763589d
 
 
e3dbfa9
6837700
 
763589d
 
 
e3dbfa9
 
 
 
763589d
e3dbfa9
 
 
763589d
e3dbfa9

import re

import streamlit as st
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# Default: Load the model on the available device(s)
@st.cache_resource
def init_qwen_model():
    """Load the Qwen2-VL-7B-Instruct model together with its processor.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the loaded objects instead of reloading them on every script rerun.
    ``torch_dtype="auto"`` and ``device_map="auto"`` leave the choice of
    dtype and device placement to the library.

    Returns:
        A ``(model, processor)`` tuple.
    """
    checkpoint = "Qwen/Qwen2-VL-7B-Instruct"
    qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
        checkpoint,
        torch_dtype="auto",
        device_map="auto",
    )
    qwen_processor = AutoProcessor.from_pretrained(checkpoint)
    return qwen_model, qwen_processor

# OCR inference: a cached worker plus the public entry point that calls it.
@st.cache_data
def _run_qwen_ocr(uploaded_file, _model, _processor, max_new_tokens):
    """Run the OCR prompt on one uploaded image and return the decoded text.

    The leading underscore on ``_model`` and ``_processor`` tells
    ``st.cache_data`` not to hash those arguments, so the cache key is built
    from ``uploaded_file`` and ``max_new_tokens`` only.
    """
    # Open the uploaded image file and show it in the app.
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Run Optical Character recognition on the image."},
            ],
        }
    ]

    # Preparation for inference
    prompt = _processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = _processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # The model is loaded with device_map="auto", so it may not live on the
    # CPU; send the inputs to wherever the model actually is.
    inputs = inputs.to(_model.device)

    # Inference: generation of the output
    generated_ids = _model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Each generated sequence starts with its prompt tokens; slice them off
    # so only the newly generated tokens are decoded.
    completion_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return _processor.batch_decode(
        completion_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


def get_qwen_text(uploaded_file, model, processor, max_new_tokens=128):
    """Extract text from an uploaded image with a Qwen2-VL model.

    Args:
        uploaded_file: Image file object (anything ``PIL.Image.open``
            accepts), or ``None``.
        model: Loaded generation model; its ``generate`` method and
            ``device`` attribute are used.
        processor: Matching processor used to build the prompt, encode the
            inputs and decode the generated ids.
        max_new_tokens: Upper bound on generated tokens. Defaults to 128,
            the value that was previously hard-coded.

    Returns:
        The list of decoded strings produced by ``processor.batch_decode``
        (one entry per prompt, i.e. a single entry here), or ``None`` when
        ``uploaded_file`` is ``None``.
    """
    if uploaded_file is None:
        return None
    return _run_qwen_ocr(uploaded_file, model, processor, max_new_tokens)
    
# Streamlit app title
st.title("OCR Image Text Extraction")

# Initialize the model and processor
MODEL, PROCESSOR = init_qwen_model()

# File uploader for images
uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])

if uploaded_file:
    st.subheader("Extracted Text:")
    output = get_qwen_text(uploaded_file, MODEL, PROCESSOR)
    # get_qwen_text returns the batch-decoded result, which is a list of
    # strings; join it into one string so it can be shown and searched as
    # text (str methods such as .lower() do not exist on a list).
    if isinstance(output, str):
        extracted_text = output
    else:
        extracted_text = "\n".join(output or [])
    st.write(extracted_text)

    # Keyword search functionality
    st.subheader("Keyword Search")
    search_query = st.text_input("Enter keywords to search within the extracted text")

    # Ignore queries that are empty or whitespace-only.
    query = (search_query or "").strip()
    if query:
        # Escape the query so it is matched literally, and match
        # case-insensitively so the highlight agrees with the match test.
        pattern = re.compile(re.escape(query), re.IGNORECASE)
        if pattern.search(extracted_text):
            # Wrap every occurrence in bold markers, keeping its original casing.
            highlighted_text = pattern.sub(
                lambda match: f"**{match.group(0)}**", extracted_text
            )
            st.write(f"Matching Text: {highlighted_text}")
        else:
            st.write("No matching text found.")
else:
    st.info("Please upload an image to extract text.")