import time

import gradio as gr
from gradio_client import Client, handle_file
from huggingface_hub import HfApi

# Backend Spaces this demo forwards every request to.
BASE_REPO_ID = "HuggingFaceH4/idefics2-8b-playground"
DPO_REPO_ID = "HuggingFaceH4/idefics2-8b-vdpoed-playground"

# Stages for which no restart is requested before waiting for "RUNNING".
_NO_RESTART_STAGES = ("RUNNING", "APP_STARTING", "RUNNING_APP_STARTING")

api = HfApi()
repo_ids = [BASE_REPO_ID, DPO_REPO_ID]

# Request a restart of every backend Space that is not in one of the stages above.
# The restart must target the Space being inspected (`repo_id`), not a fixed
# one; otherwise a stopped second Space is never restarted and the wait loop
# below polls it indefinitely.
for repo_id in repo_ids:
    if api.space_info(repo_id).runtime.stage not in _NO_RESTART_STAGES:
        api.restart_space(repo_id=repo_id)

# Block until every backend Space reports the "RUNNING" stage, polling once
# per second.
for repo_id in repo_ids:
    while api.space_info(repo_id).runtime.stage != "RUNNING":
        time.sleep(1)

client_idefics2 = Client(BASE_REPO_ID)
client_idefics2_dpoed = Client(DPO_REPO_ID)


def respond(multimodal_input):
    """Send the same prompt to both backend Spaces and return both answers.

    Args:
        multimodal_input: Mapping with a "text" entry and a "files" entry;
            each item of "files" is passed through ``handle_file`` before
            being sent.

    Returns:
        A ``(text_1, text_2)`` tuple: the "/predict" result from the base
        Space client followed by the result from the DPO Space client.
    """
    payload = {
        "text": multimodal_input["text"],
        "files": [handle_file(file) for file in multimodal_input["files"]],
    }
    text_1 = client_idefics2.predict(payload, api_name="/predict")
    text_2 = client_idefics2_dpoed.predict(payload, api_name="/predict")
    return text_1, text_2


gr.Interface(
    respond,
    title="Compare IDEFICS2-8B Against DPO",
    description="Compare IDEFICS2-8B against DPO fine-tuned IDEFICS2-8B in this demo. Learn more about vision language model DPO in this [blog](https://huggingface.co/blog/dpo_vlm).",
    inputs=[gr.MultimodalTextbox(file_types=["image"], show_label=False)],
    outputs=[gr.Textbox(label="idefics2-8b"), gr.Textbox(label="idefics2-8b-dpoed")],
    examples=[
        {
            "text": "What is the type of flower in the image and what insect is on it?",
            "files": ["./bee.jpg"],
        },
        {"text": "Describe the image", "files": ["./howl.jpg"]},
    ],
    cache_examples=False,
).launch()