Spaces:

ruslanmv
/

Llama-3.2-11B-Vision-Instruct

Paused

ruslanmv committed on Oct 2, 2024

Commit

f3700d6

1 Parent(s): b39a5c0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -53,7 +53,11 @@ def predict(image, text):
     # Decode the output to return the final response
     response = processor.decode(outputs[0], skip_special_tokens=True)
-    return response
 # Define the Gradio interface
 interface = gr.Interface(
@@ -62,7 +66,7 @@ interface = gr.Interface(
         gr.Image(type="pil", label="Image Input"),  # Image input with label
         gr.Textbox(label="Text Input")  # Textbox input with label
     ],
-    outputs=gr.Textbox(label="Generated Response"),  # Output with a more descriptive label
     title="Llama 3.2 11B Vision Instruct Demo",  # Title of the interface
     description="This demo uses Meta's Llama 3.2 11B Vision model to generate responses based on an image and text input.",  # Short description
     theme="compact"  # Using a compact theme for a cleaner look

     # Decode the output to return the final response
     response = processor.decode(outputs[0], skip_special_tokens=True)
+    # Format the conversation for a better appearance
+    formatted_response = f"User: {text}\n\nAssistant: {response}"
+    return formatted_response
 # Define the Gradio interface
 interface = gr.Interface(
         gr.Image(type="pil", label="Image Input"),  # Image input with label
         gr.Textbox(label="Text Input")  # Textbox input with label
     ],
+    outputs=gr.Textbox(label="Conversation"),  # Output with a more descriptive label
     title="Llama 3.2 11B Vision Instruct Demo",  # Title of the interface
     description="This demo uses Meta's Llama 3.2 11B Vision model to generate responses based on an image and text input.",  # Short description
     theme="compact"  # Using a compact theme for a cleaner look