import streamlit as st
from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, PaliGemmaForConditionalGeneration, AutoProcessor
from colpali_engine.models import ColPali, ColPaliProcessor
from huggingface_hub import login
import os

# Set device for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get Hugging Face token from environment variables
hf_token = os.getenv('HF_TOKEN')
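
# Fail fast when the token is absent: login(token=None) falls back to an
# interactive prompt, which does not work in a headless deployment.
if not hf_token:
    st.error("HF_TOKEN environment variable is not set.")
    st.stop()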

# Log in to Hugging Face Hub (this will authenticate globally)
login(token=hf_token)

# Load the processor and image-to-text model directly
try:
    processor_img_to_text = AutoProcessor.from_pretrained("google/paligemma-3b-mix-448")
    model_img_to_text = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-448").to(device)
except Exception as e:
    st.error(f"Error loading image-to-text model: {e}")
    st.stop()

# Load ColPali model with Hugging Face token
try:
    model_colpali = ColPali.from_pretrained("vidore/colpali-v1.2", torch_dtype=torch.bfloat16).to(device)
    processor_colpali = ColPaliProcessor.from_pretrained("vidore/colpali-v1.2")
except Exception as e:
    st.error(f"Error loading ColPali model or processor: {e}")
    st.stop()

# Load Qwen model
try:
    model_qwen = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct").to(device)
    processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
except Exception as e:
    st.error(f"Error loading Qwen model or processor: {e}")
    st.stop()
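
# Note: Qwen2-VL-7B in full precision needs roughly 30 GB of memory. If loading
# fails with out-of-memory errors, a common mitigation (a sketch, assuming the
# accelerate package is installed for device_map support) is:
#   model_qwen = Qwen2VLForConditionalGeneration.from_pretrained(
#       "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")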

# Streamlit UI
st.title("OCR and Document Search Web Application")
st.write("Upload an image containing text in both Hindi and English for OCR processing and keyword search.")

# File uploader for the image
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    try:
        image = Image.open(uploaded_file)
        st.image(image, caption='Uploaded Image.', use_container_width=True)
        st.write("")

        # PaliGemma is prompt-driven: the "mix" checkpoints expect a task prefix,
        # so pass the "ocr" prompt to request text extraction from the image.
        inputs_img_to_text = processor_img_to_text(text="ocr", images=image, return_tensors="pt").to(device)
        prompt_len = inputs_img_to_text["input_ids"].shape[-1]
        with torch.no_grad():
            generated_ids_img_to_text = model_img_to_text.generate(**inputs_img_to_text, max_new_tokens=128)
            # Strip the prompt tokens so only the newly generated text is decoded
            output_text_img_to_text = processor_img_to_text.batch_decode(
                generated_ids_img_to_text[:, prompt_len:], skip_special_tokens=True, clean_up_tokenization_spaces=True)

        st.write("Extracted Text from Image:")
        st.write(output_text_img_to_text)

        # Prepare input for Qwen model for image description
        conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
        text_prompt = processor_qwen.apply_chat_template(conversation, add_generation_prompt=True)
        inputs_qwen = processor_qwen(text=[text_prompt], images=[image], padding=True, return_tensors="pt").to(device)

        # Generate response with Qwen model
        with torch.no_grad():
            output_ids_qwen = model_qwen.generate(**inputs_qwen, max_new_tokens=128)
            # Trim the prompt tokens from each sequence so only the response is decoded
            generated_ids_qwen = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs_qwen.input_ids, output_ids_qwen)]
            output_text_qwen = processor_qwen.batch_decode(generated_ids_qwen, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        st.write("Qwen Model Description:")
        st.write(output_text_qwen)

        # Keyword search in the extracted text
        keyword = st.text_input("Enter a keyword to search in the extracted text:")
        if keyword:
            if keyword.lower() in output_text_img_to_text[0].lower():
                st.write(f"Keyword '{keyword}' found in the text.")
            else:
                st.write(f"Keyword '{keyword}' not found in the text.")
    except Exception as e:
        st.error(f"An error occurred: {e}")

# Note: under `streamlit run`, the script executes top to bottom and __name__ is
# "__main__", so this message simply marks the end of the page render.
if __name__ == "__main__":
    st.write("Web application is running.")