# imagetext-to-text
import base64
import io

import gradio as gr
from huggingface_hub import InferenceClient

# Alternative vision-language models that can be swapped in:
# client = InferenceClient('Qwen/Qwen2.5-VL-7B-Instruct')
# client = InferenceClient("mistralai/Pixtral-12B-Base-2409")
client = InferenceClient('meta-llama/Llama-3.2-11B-Vision-Instruct')


def imageDescription(image, prompt):
    """Ask the vision-language model a question about an uploaded image.

    The image is PNG-encoded in memory, embedded in a base64 ``data:`` URL and
    sent together with the text prompt as a single user chat message.

    Args:
        image: The uploaded picture as a PIL image (the ``gr.Image`` component
            is configured with ``type="pil"``), or ``None`` when nothing has
            been uploaded.
        prompt: The user's question or instruction about the picture.

    Returns:
        The text content of the first choice in the model's chat completion.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Gradio passes None when the image component is empty; surface a
        # readable message instead of an AttributeError on `image.save`.
        raise gr.Error("Please upload an image first.")

    # Encode in memory rather than through a shared "image.png" on disk:
    # concurrent requests would otherwise overwrite each other's file.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
    image_url = f"data:image/png;base64,{base64_image}"

    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            },
        ],
    )
    return response.choices[0].message.content


with gr.Blocks(theme=gr.themes.Citrus()) as demo:
    with gr.Row():
        with gr.Column():
            # Image input, delivered to the handler as a PIL image.
            image = gr.Image(type="pil", label="upload an image")
        with gr.Column():
            prompt = gr.Textbox(
                label="What would you like to know about this picture?",
                scale=1,
            )
            describe_btn = gr.Button("Describe the image", scale=1)
            output = gr.Textbox(label="Description", scale=1)
    # Send both inputs (image and prompt) to imageDescription and show the
    # returned text in the output box.
    describe_btn.click(
        fn=imageDescription,
        inputs=[image, prompt],
        outputs=output,
    )

demo.launch(debug=True)