mjavaid committed on
Commit
38746a1
·
1 Parent(s): 28691d0

first commit

Browse files
Files changed (1) hide show
  1. app.py +29 -38
app.py CHANGED
@@ -2,13 +2,12 @@ import spaces
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoProcessor, AutoModelForImageTextToText
5
- import requests
6
  import os
7
 
8
  hf_token = os.environ.get("HF_TOKEN")
9
  model_id = "CohereForAI/aya-vision-8b"
10
 
11
- # Load the model and processor during startup.
12
  try:
13
  processor = AutoProcessor.from_pretrained(model_id)
14
  model = AutoModelForImageTextToText.from_pretrained(
@@ -30,18 +29,18 @@ def process_image_and_prompt(uploaded_image, image_url, prompt, temperature=0.3,
30
  if processor is None or model is None:
31
  return "Model failed to load. Please check the logs."
32
 
33
- # Determine which image to use:
34
- # If an image is uploaded, it is returned as a file path.
35
- if uploaded_image is not None:
36
- # If the file path does not start with "http", prefix with '/file/' so that
37
- # the Hugging Face Space can serve it via an HTTP URL.
38
- img_url = uploaded_image if uploaded_image.startswith("http") else f"/file/{uploaded_image}"
39
  elif image_url and image_url.strip():
40
  img_url = image_url.strip()
41
  else:
42
  return "Please provide either an image upload or an image URL."
43
-
44
  # Build the message using the Aya Vision chat template.
 
45
  messages = [
46
  {
47
  "role": "user",
@@ -61,7 +60,7 @@ def process_image_and_prompt(uploaded_image, image_url, prompt, temperature=0.3,
61
  return_dict=True,
62
  return_tensors="pt"
63
  ).to(model.device)
64
-
65
  gen_tokens = model.generate(
66
  **inputs,
67
  max_new_tokens=int(max_tokens),
@@ -87,53 +86,45 @@ examples = [
87
  # Build the Gradio interface.
88
  with gr.Blocks(title="Aya Vision 8B Demo") as demo:
89
  gr.Markdown("# Aya Vision 8B Model Demo")
90
- gr.Markdown("""
91
- This app demonstrates the C4AI Aya Vision 8B model, an 8-billion parameter vision-language model with capabilities including:
92
- - OCR (reading text from images)
93
- - Image captioning
94
- - Visual reasoning
95
- - Question answering
96
- - Support for 23 languages
97
-
98
- Upload an image or provide a URL, and enter a prompt to get started!
99
- """)
100
-
101
- # Display model loading status.
102
  gr.Markdown(f"**Model Status:** {model_status}")
103
-
104
- gr.Markdown("### Provide an image (upload or URL):")
105
  with gr.Tab("Upload Image"):
106
- # Set type to 'filepath' to get the file path from the upload.
107
- image_input = gr.Image(label="Upload Image", type="filepath")
108
  with gr.Tab("Image URL"):
109
- image_url_input = gr.Textbox(label="Image URL", placeholder="Enter a URL to an image")
110
 
111
- prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt to the model", lines=3)
112
 
113
  with gr.Accordion("Generation Settings", open=False):
114
- temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
115
- max_tokens = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
116
-
117
- generate_button = gr.Button("Generate Response", variant="primary")
118
 
119
- with gr.Column():
120
- output = gr.Textbox(label="Model Response", lines=10)
121
 
122
  gr.Markdown("### Examples")
123
  gr.Examples(
124
  examples=examples,
125
- inputs=[image_input, image_url_input, prompt, temperature, max_tokens],
126
  outputs=output,
127
  fn=process_image_and_prompt
128
  )
129
 
130
- # Determine which image input to use when generating the response.
131
  def generate_response(uploaded_image, image_url, prompt, temperature, max_tokens):
132
  return process_image_and_prompt(uploaded_image, image_url, prompt, temperature, max_tokens)
133
 
134
- generate_button.click(
135
  generate_response,
136
- inputs=[image_input, image_url_input, prompt, temperature, max_tokens],
137
  outputs=output
138
  )
139
 
 
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoProcessor, AutoModelForImageTextToText
 
5
  import os
6
 
7
  hf_token = os.environ.get("HF_TOKEN")
8
  model_id = "CohereForAI/aya-vision-8b"
9
 
10
+ # Load the model and processor on startup.
11
  try:
12
  processor = AutoProcessor.from_pretrained(model_id)
13
  model = AutoModelForImageTextToText.from_pretrained(
 
29
  if processor is None or model is None:
30
  return "Model failed to load. Please check the logs."
31
 
32
+ # Determine which image input to use:
33
+ # If an image is uploaded, convert its file path to a URL.
34
+ if uploaded_image:
35
+ # Gradio returns a file path; if it doesn't start with "http", prefix it so that it is served.
36
+ img_url = uploaded_image if str(uploaded_image).startswith("http") else f"/file/{uploaded_image}"
 
37
  elif image_url and image_url.strip():
38
  img_url = image_url.strip()
39
  else:
40
  return "Please provide either an image upload or an image URL."
41
+
42
  # Build the message using the Aya Vision chat template.
43
+ # Note: Aya Vision requires the image to be sent as a URL.
44
  messages = [
45
  {
46
  "role": "user",
 
60
  return_dict=True,
61
  return_tensors="pt"
62
  ).to(model.device)
63
+
64
  gen_tokens = model.generate(
65
  **inputs,
66
  max_new_tokens=int(max_tokens),
 
86
  # Build the Gradio interface.
87
  with gr.Blocks(title="Aya Vision 8B Demo") as demo:
88
  gr.Markdown("# Aya Vision 8B Model Demo")
89
+ gr.Markdown(
90
+ """
91
+ This app demonstrates the C4AI Aya Vision 8B model, which requires an image URL as input.
92
+ You can either upload an image (it will be served as a URL) or provide a direct image URL.
93
+ Enter a prompt along with the image to get started!
94
+ """
95
+ )
 
 
 
 
 
96
  gr.Markdown(f"**Model Status:** {model_status}")
97
+
98
+ gr.Markdown("### Provide an Image")
99
  with gr.Tab("Upload Image"):
100
+ # Using type="filepath" returns the local file path which is then converted into a URL.
101
+ image_upload = gr.Image(label="Upload Image", type="filepath")
102
  with gr.Tab("Image URL"):
103
+ image_url_input = gr.Textbox(label="Image URL", placeholder="Enter a direct image URL")
104
 
105
+ prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here", lines=3)
106
 
107
  with gr.Accordion("Generation Settings", open=False):
108
+ temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
109
+ max_tokens_slider = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
 
 
110
 
111
+ generate_btn = gr.Button("Generate Response", variant="primary")
112
+ output = gr.Textbox(label="Model Response", lines=10)
113
 
114
  gr.Markdown("### Examples")
115
  gr.Examples(
116
  examples=examples,
117
+ inputs=[image_upload, image_url_input, prompt, temperature_slider, max_tokens_slider],
118
  outputs=output,
119
  fn=process_image_and_prompt
120
  )
121
 
 
122
  def generate_response(uploaded_image, image_url, prompt, temperature, max_tokens):
123
  return process_image_and_prompt(uploaded_image, image_url, prompt, temperature, max_tokens)
124
 
125
+ generate_btn.click(
126
  generate_response,
127
+ inputs=[image_upload, image_url_input, prompt, temperature_slider, max_tokens_slider],
128
  outputs=output
129
  )
130