import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image

# Load the processor and model
processor = AutoProcessor.from_pretrained("guneetsk99/finance_qwen_VL_7B")
model = AutoModelForImageTextToText.from_pretrained("guneetsk99/finance_qwen_VL_7B")

def predict(input_img, text_prompt):
    # Preprocess the image and text prompt
    inputs = processor(images=input_img, text=text_prompt, return_tensors="pt").to(model.device)
    
    # Generate predictions using the model
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)
    
    # Decode the generated text
    generated_text = processor.decode(outputs[0], skip_special_tokens=True)
    
    return input_img, generated_text

# Create the Gradio interface
gradio_app = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(label="Upload Image", source="upload", type="pil"),
        gr.Textbox(label="Text Prompt", placeholder="Enter a text prompt, e.g., 'Describe this image.'"),
    ],
    outputs=[
        gr.Image(label="Uploaded Image"),
        gr.Textbox(label="Generated Response"),
    ],
    title="Finance Image-to-Text Model",
    description="Upload a financial document image and provide a text prompt for the model to process the image and generate a text response.",
)

if __name__ == "__main__":
    gradio_app.launch()