import spaces
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
import os

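# Hub token read from the environment (e.g., a Space secret); it may be required for gated checkpoints.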
hf_token = os.environ.get("HF_TOKEN")
model_id = "CohereForAI/aya-vision-8b"

# Load the model and processor at startup.
try:
    processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
    model = AutoModelForImageTextToText.from_pretrained(
        model_id, device_map="auto", torch_dtype=torch.float16, token=hf_token
    )
    model_status = "Model loaded successfully!"
except Exception as e:
    processor = None
    model = None
    model_status = (
        f"Error loading model: {e}\nMake sure to install the correct version of transformers with: "
        "pip install 'git+https://github.com/huggingface/[email protected]'"
    )

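# On ZeroGPU Spaces, the @spaces.GPU decorator allocates a GPU for the duration of each call.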
@spaces.GPU
def process_image_and_prompt(uploaded_image, image_url, prompt, temperature=0.3, max_tokens=300):
    if processor is None or model is None:
        return "Model failed to load. Please check the logs."
    
    # Determine which image input to use:
    if uploaded_image:
        # If an image is uploaded, use the image directly.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": uploaded_image},
                {"type": "text", "text": prompt},
            ],
        }]
    elif image_url and image_url.strip():
        # Otherwise, use the provided image URL.
        img_url = image_url.strip()
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "url": img_url},
                {"type": "text", "text": prompt},
            ],
        }]
    else:
        return "Please provide either an image upload or an image URL."
    
    try:
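        # Build model inputs from the chat messages; the processor fetches the image
        # (local path or URL) and tokenizes the text in one pass.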
        inputs = processor.apply_chat_template(
            messages,
            padding=True,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)

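        # Sample a response; temperature and max_new_tokens come from the UI sliders.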
        gen_tokens = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
        )
    
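        # Decode only the newly generated tokens (everything after the prompt).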
        response = processor.tokenizer.decode(
            gen_tokens[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )
        return response
    except Exception as e:
        return f"Error generating response: {e}"

# Example inputs for testing.
examples = [
    [None, "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", "What landmark is shown in this image?", 0.3, 300],
    [None, "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", "What does the text in this image say?", 0.3, 300],
    [None, "https://upload.wikimedia.org/wikipedia/commons/d/da/The_Parthenon_in_Athens.jpg", "Describe esta imagen en español", 0.3, 300]
]
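# The upload field is None in every example; each image is supplied via URL instead.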

# Build the Gradio interface.
with gr.Blocks(title="Aya Vision 8B Demo") as demo:
    gr.Markdown("# Aya Vision 8B Model Demo")
    gr.Markdown(
        """
This app demonstrates the Aya Vision 8B model. Upload an image or provide an image URL, then enter a prompt about the image.
        """
    )
    gr.Markdown(f"**Model Status:** {model_status}")

    gr.Markdown("### Provide an Image")
    with gr.Tab("Upload Image"):
        # type="filepath" returns the local file path, which is passed straight into the chat message.
        image_upload = gr.Image(label="Upload Image", type="filepath")
    with gr.Tab("Image URL"):
        image_url_input = gr.Textbox(label="Image URL", placeholder="Enter a direct image URL")
    
    prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here", lines=3)
    
    with gr.Accordion("Generation Settings", open=False):
        temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
        max_tokens_slider = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
    
    generate_btn = gr.Button("Generate Response", variant="primary")
    output = gr.Textbox(label="Model Response", lines=10)
    
    gr.Markdown("### Examples")
    gr.Examples(
        examples=examples,
        inputs=[image_upload, image_url_input, prompt, temperature_slider, max_tokens_slider],
        outputs=output,
        fn=process_image_and_prompt
    )
    
    generate_btn.click(
        process_image_and_prompt,
        inputs=[image_upload, image_url_input, prompt, temperature_slider, max_tokens_slider],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()