"""Gradio demo that sends an image plus an optional text prompt to a Llava model.

The model and processor are loaded once at import time; each request is
served by ``generate_text``, which is wrapped with ``spaces.GPU``.
"""

from typing import Optional

import gradio as gr
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import torch
import spaces

# Checkpoint to load and the device it is moved to.
model_id = "mrcuddle/lumimaid-v0.2-8b-pixtral"
DEVICE = "cuda"

# Every uploaded image is resized to this resolution (adjust if necessary).
IMAGE_SIZE = (336, 336)

# Prompt used when the user leaves the text box empty.
DEFAULT_PROMPT = "Describe the image."

# Upper bound on generated tokens. Without an explicit budget, generate()
# falls back to the library default ``max_length``, which also counts the
# prompt tokens and is far too small for a multimodal prompt.
MAX_NEW_TOKENS = 256

# Load the Llava model and processor.
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id).to(DEVICE)
print(model.config)


@spaces.GPU
def generate_text(input_text: str = "", image: Optional[Image.Image] = None) -> str:
    """Generate a text response for an image and an optional prompt.

    Args:
        input_text: Prompt to accompany the image. When empty or
            whitespace-only, ``DEFAULT_PROMPT`` is used instead.
        image: The uploaded PIL image, or ``None`` when nothing was uploaded.

    Returns:
        The generated text (without the echoed prompt), or a short message
        asking for an image when ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    # Uploads may carry an alpha channel or a palette; normalise to RGB and
    # resize to the fixed resolution used by this demo.
    image = image.convert("RGB").resize(IMAGE_SIZE)

    # Fall back to the default prompt for missing or blank text.
    if not input_text or not input_text.strip():
        input_text = DEFAULT_PROMPT

    # NOTE(review): some Llava-style processors expect an image placeholder
    # token inside the text; confirm against this checkpoint's chat template.
    inputs = processor(text=input_text, images=image, return_tensors="pt").to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # generate() returns prompt + continuation; decode only the continuation
    # so the prompt is not echoed back to the user.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_length:]
    generated_text = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
    return generated_text.strip()


# Create Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Enter your text here (optional)", value=""),
        gr.Image(label="Upload an image", type="pil"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="Llava Model Interaction",
    description="Interact with the Llava model using text and image inputs. If no text is provided, the model will describe the image.",
)

# Launch the interface
iface.launch()