import gradio as gr from transformers import AutoProcessor, AutoModelForImageTextToText import torch from PIL import Image # Load the processor and model processor = AutoProcessor.from_pretrained("guneetsk99/finance_qwen_VL_7B") model = AutoModelForImageTextToText.from_pretrained("guneetsk99/finance_qwen_VL_7B") def predict(input_img, text_prompt): # Preprocess the image and text prompt inputs = processor(images=input_img, text=text_prompt, return_tensors="pt").to(model.device) # Generate predictions using the model with torch.no_grad(): outputs = model.generate(**inputs, max_new_tokens=50) # Decode the generated text generated_text = processor.decode(outputs[0], skip_special_tokens=True) return input_img, generated_text # Create the Gradio interface gradio_app = gr.Interface( fn=predict, inputs=[ gr.Image(label="Upload Image", source="upload", type="pil"), gr.Textbox(label="Text Prompt", placeholder="Enter a text prompt, e.g., 'Describe this image.'"), ], outputs=[ gr.Image(label="Uploaded Image"), gr.Textbox(label="Generated Response"), ], title="Finance Image-to-Text Model", description="Upload a financial document image and provide a text prompt for the model to process the image and generate a text response.", ) if __name__ == "__main__": gradio_app.launch()