Spaces:

hanzla
/

PlaygroundAyaVision

Running on Zero

App Files Files Community

mjavaid committed on 16 days ago

Commit

38746a1

1 Parent(s): 28691d0

first commit

Browse files

Files changed (1) hide show

app.py +29 -38

app.py CHANGED Viewed

@@ -2,13 +2,12 @@ import spaces
 import gradio as gr
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
-import requests
 import os
 hf_token = os.environ.get("HF_TOKEN")
 model_id = "CohereForAI/aya-vision-8b"
-# Load the model and processor during startup.
 try:
     processor = AutoProcessor.from_pretrained(model_id)
     model = AutoModelForImageTextToText.from_pretrained(
@@ -30,18 +29,18 @@ def process_image_and_prompt(uploaded_image, image_url, prompt, temperature=0.3,
     if processor is None or model is None:
         return "Model failed to load. Please check the logs."
-    # Determine which image to use:
-    # If an image is uploaded, it is returned as a file path.
-    if uploaded_image is not None:
-        # If the file path does not start with "http", prefix with '/file/' so that
-        # the Hugging Face Space can serve it via an HTTP URL.
-        img_url = uploaded_image if uploaded_image.startswith("http") else f"/file/{uploaded_image}"
     elif image_url and image_url.strip():
         img_url = image_url.strip()
     else:
         return "Please provide either an image upload or an image URL."
     # Build the message using the Aya Vision chat template.
     messages = [
         {
             "role": "user",
@@ -61,7 +60,7 @@ def process_image_and_prompt(uploaded_image, image_url, prompt, temperature=0.3,
             return_dict=True,
             return_tensors="pt"
         ).to(model.device)
         gen_tokens = model.generate(
             **inputs,
             max_new_tokens=int(max_tokens),
@@ -87,53 +86,45 @@ examples = [
 # Build the Gradio interface.
 with gr.Blocks(title="Aya Vision 8B Demo") as demo:
     gr.Markdown("# Aya Vision 8B Model Demo")
-    gr.Markdown("""
-    This app demonstrates the C4AI Aya Vision 8B model, an 8-billion parameter vision-language model with capabilities including:
-    - OCR (reading text from images)
-    - Image captioning
-    - Visual reasoning
-    - Question answering
-    - Support for 23 languages
-    Upload an image or provide a URL, and enter a prompt to get started!
-    """)
-    # Display model loading status.
     gr.Markdown(f"**Model Status:** {model_status}")
-    gr.Markdown("### Provide an image (upload or URL):")
     with gr.Tab("Upload Image"):
-        # Set type to 'filepath' to get the file path from the upload.
-        image_input = gr.Image(label="Upload Image", type="filepath")
     with gr.Tab("Image URL"):
-        image_url_input = gr.Textbox(label="Image URL", placeholder="Enter a URL to an image")
-    prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt to the model", lines=3)
     with gr.Accordion("Generation Settings", open=False):
-        temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
-        max_tokens = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
-    generate_button = gr.Button("Generate Response", variant="primary")
-    with gr.Column():
-        output = gr.Textbox(label="Model Response", lines=10)
     gr.Markdown("### Examples")
     gr.Examples(
         examples=examples,
-        inputs=[image_input, image_url_input, prompt, temperature, max_tokens],
         outputs=output,
         fn=process_image_and_prompt
     )
-    # Determine which image input to use when generating the response.
     def generate_response(uploaded_image, image_url, prompt, temperature, max_tokens):
         """Click callback: pass the UI values straight to process_image_and_prompt.

         All five arguments are forwarded unchanged, in the same order, and the
         value returned by process_image_and_prompt is returned as-is.
         """
         return process_image_and_prompt(uploaded_image, image_url, prompt, temperature, max_tokens)
-    generate_button.click(
         generate_response,
-        inputs=[image_input, image_url_input, prompt, temperature, max_tokens],
         outputs=output
     )

 import gradio as gr
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
 import os
 hf_token = os.environ.get("HF_TOKEN")
 model_id = "CohereForAI/aya-vision-8b"
+# Load the model and processor on startup.
 try:
     processor = AutoProcessor.from_pretrained(model_id)
     model = AutoModelForImageTextToText.from_pretrained(
     if processor is None or model is None:
         return "Model failed to load. Please check the logs."
+    # Determine which image input to use:
+    # If an image is uploaded, convert its file path to a URL.
+    if uploaded_image:
+        # Gradio returns a file path; if it doesn't start with "http", prefix it so that it is served.
+        img_url = uploaded_image if str(uploaded_image).startswith("http") else f"/file/{uploaded_image}"
     elif image_url and image_url.strip():
         img_url = image_url.strip()
     else:
         return "Please provide either an image upload or an image URL."
     # Build the message using the Aya Vision chat template.
+    # Note: Aya Vision requires the image to be sent as a URL.
     messages = [
         {
             "role": "user",
             return_dict=True,
             return_tensors="pt"
         ).to(model.device)
         gen_tokens = model.generate(
             **inputs,
             max_new_tokens=int(max_tokens),
 # Build the Gradio interface.
 with gr.Blocks(title="Aya Vision 8B Demo") as demo:
     gr.Markdown("# Aya Vision 8B Model Demo")
+    gr.Markdown(
+        """
+This app demonstrates the C4AI Aya Vision 8B model, which requires an image URL as input.
+You can either upload an image (it will be served as a URL) or provide a direct image URL.
+Enter a prompt along with the image to get started!
+        """
+    )
     gr.Markdown(f"**Model Status:** {model_status}")
+    gr.Markdown("### Provide an Image")
     with gr.Tab("Upload Image"):
+        # Using type="filepath" returns the local file path which is then converted into a URL.
+        image_upload = gr.Image(label="Upload Image", type="filepath")
     with gr.Tab("Image URL"):
+        image_url_input = gr.Textbox(label="Image URL", placeholder="Enter a direct image URL")
+    prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here", lines=3)
     with gr.Accordion("Generation Settings", open=False):
+        temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
+        max_tokens_slider = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
+    generate_btn = gr.Button("Generate Response", variant="primary")
+    output = gr.Textbox(label="Model Response", lines=10)
     gr.Markdown("### Examples")
     gr.Examples(
         examples=examples,
+        inputs=[image_upload, image_url_input, prompt, temperature_slider, max_tokens_slider],
         outputs=output,
         fn=process_image_and_prompt
     )
     def generate_response(uploaded_image, image_url, prompt, temperature, max_tokens):
         """Click callback: pass the UI values straight to process_image_and_prompt.

         All five arguments are forwarded unchanged, in the same order, and the
         value returned by process_image_and_prompt is returned as-is.
         """
         return process_image_and_prompt(uploaded_image, image_url, prompt, temperature, max_tokens)
+    generate_btn.click(
         generate_response,
+        inputs=[image_upload, image_url_input, prompt, temperature_slider, max_tokens_slider],
         outputs=output
     )