import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import requests
import gradio as gr
import spaces # Import Hugging Face Spaces package
# Load model and tokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'scb10x/llama-3-typhoon-v1.5-8b-instruct-vision-preview'
@spaces.GPU(duration=60) # Decorate the function to dynamically request and release GPU
def load_model():
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        revision='main',  # Or a specific commit hash
        torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
        device_map='auto',
        trust_remote_code=True
    )
    return model

model = load_model()
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
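
# Build the model prompt: apply the chat template, then splice the image
# placeholder into the token sequence so the image features can be inserted there.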
def prepare_inputs(text, image, device='cuda'):
    messages = [
        {"role": "system", "content": "You are a helpful vision-capable assistant who eagerly converses with the user in their language."},
    ]
    messages.append({"role": "user", "content": "<|image|>\n" + text})
    inputs_formatted = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False
    )
    # Split the formatted prompt around the <|image|> placeholder and tokenize each part
    text_chunks = [tokenizer(chunk).input_ids for chunk in inputs_formatted.split('<|image|>')]
    # -200 marks the image position (LLaVA-style image token index); drop the extra BOS
    # token that the tokenizer prepends to the second chunk
    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(device)
    attention_mask = torch.ones_like(input_ids).to(device)
    return input_ids, attention_mask

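# Download the image, preprocess it with the model's process_images helper,
# and generate a short answer about it.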
@spaces.GPU(duration=60) # Decorate the function for GPU use
def predict(prompt, img_url):
    try:
        image = Image.open(requests.get(img_url, stream=True).raw)
        image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)
        input_ids, attention_mask = prepare_inputs(prompt, image, device=device)
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            max_new_tokens=100,
            use_cache=True,
            temperature=0.2,
            top_p=0.2,
            repetition_penalty=1.0
        )[0]
        # Decode only the newly generated tokens, skipping the prompt
        result = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
        return result
    except Exception as e:
        return str(e)

# Gradio Interface
inputs = [
    gr.Textbox(label="Prompt", placeholder="Ask about the food in the image"),
    gr.Textbox(label="Image URL", placeholder="Enter an image URL")
]
outputs = gr.Textbox(label="Generated Output")

gr.Interface(
    fn=predict, inputs=inputs, outputs=outputs, title="Food Image AI Assistant",
    description="This model can analyze food images and answer questions about them."
).launch()
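
# On a Hugging Face Space the Gradio SDK serves this interface automatically;
# run locally, launch() starts a server at http://127.0.0.1:7860 by default.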