import spaces
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
import os

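# Hub token read from the environment (e.g., a Space secret); it may be required for gated checkpoints.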
hf_token = os.environ.get("HF_TOKEN")
model_id = "CohereForAI/aya-vision-8b"

# Load the model and processor at startup.
try:
    processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
    model = AutoModelForImageTextToText.from_pretrained(
        model_id, device_map="auto", torch_dtype=torch.float16, token=hf_token
    )
    model_status = "Model loaded successfully!"
except Exception as e:
    processor = None
    model = None
    model_status = (
        f"Error loading model: {e}\nMake sure to install the correct version of transformers with: "
        "pip install 'git+https://github.com/huggingface/[email protected]'"
    )

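# On ZeroGPU Spaces, the @spaces.GPU decorator allocates a GPU for the duration of each call.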
@spaces.GPU
def process_image_and_prompt(uploaded_image, image_url, prompt, temperature=0.3, max_tokens=300):
    if processor is None or model is None:
        return "Model failed to load. Please check the logs."
    
    # Determine which image input to use:
    if uploaded_image:
        # If an image is uploaded, use the image directly.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": uploaded_image},
                {"type": "text", "text": prompt},
            ],
        }]
    elif image_url and image_url.strip():
        # Otherwise, use the provided image URL.
        img_url = image_url.strip()
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "url": img_url},
                {"type": "text", "text": prompt},
            ],
        }]
    else:
        return "Please provide either an image upload or an image URL."
    
    try:
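        # Build model inputs from the chat messages; the processor fetches the image
        # (local path or URL) and tokenizes the text in one pass.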
        inputs = processor.apply_chat_template(
            messages,
            padding=True,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)

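        # Sample a response; temperature and max_new_tokens come from the UI sliders.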
        gen_tokens = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
        )
    
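        # Decode only the newly generated tokens (everything after the prompt).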
        response = processor.tokenizer.decode(
            gen_tokens[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )
        return response
    except Exception as e:
        return f"Error generating response: {e}"

# Example inputs for testing.
examples = [
    [None, "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", "What landmark is shown in this image?", 0.3, 300],
    [None, "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", "What does the text in this image say?", 0.3, 300],
    [None, "https://upload.wikimedia.org/wikipedia/commons/d/da/The_Parthenon_in_Athens.jpg", "Describe esta imagen en español", 0.3, 300]
]
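# The upload field is None in every example; each image is supplied via URL instead.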

# Build the Gradio interface.
with gr.Blocks(title="Aya Vision 8B Demo") as demo:
    gr.Markdown("# Aya Vision 8B Model Demo")
    gr.Markdown(
        """
This app demonstrates the Aya Vision 8B model. Upload an image or provide an image URL, then enter a prompt about the image.
        """
    )
    gr.Markdown(f"**Model Status:** {model_status}")

    gr.Markdown("### Provide an Image")
    with gr.Tab("Upload Image"):
        # type="filepath" returns the local file path, which is passed straight into the chat message.
        image_upload = gr.Image(label="Upload Image", type="filepath")
    with gr.Tab("Image URL"):
        image_url_input = gr.Textbox(label="Image URL", placeholder="Enter a direct image URL")
    
    prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here", lines=3)
    
    with gr.Accordion("Generation Settings", open=False):
        temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
        max_tokens_slider = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
    
    generate_btn = gr.Button("Generate Response", variant="primary")
    output = gr.Textbox(label="Model Response", lines=10)
    
    gr.Markdown("### Examples")
    gr.Examples(
        examples=examples,
        inputs=[image_upload, image_url_input, prompt, temperature_slider, max_tokens_slider],
        outputs=output,
        fn=process_image_and_prompt
    )
    
    generate_btn.click(
        process_image_and_prompt,
        inputs=[image_upload, image_url_input, prompt, temperature_slider, max_tokens_slider],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()