Rathapoom committed on
Commit 89f9139 · verified · 1 Parent(s): 4ffaf2a

Create app.py

Files changed (1): app.py +81 -0
app.py ADDED
@@ -0,0 +1,81 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from PIL import Image
+ import requests
+ import gradio as gr
+
+ # Load model and tokenizer. With device_map='auto', Accelerate places the
+ # weights itself; only the input tensors need to be moved to `device`.
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ model = AutoModelForCausalLM.from_pretrained(
+     'scb10x/llama-3-typhoon-v1.5-8b-instruct-vision-preview',
+     torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
+     device_map='auto',
+     trust_remote_code=True
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     'scb10x/llama-3-typhoon-v1.5-8b-instruct-vision-preview',
+     trust_remote_code=True
+ )
+
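+ # Note: helper methods such as model.process_images (used in predict below)
+ # come from this model's remote code (trust_remote_code=True), not the
+ # standard transformers API.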
+ def prepare_inputs(text, image, device='cuda'):
+     messages = [
+         {"role": "system", "content": "You are a helpful vision-capable assistant who eagerly converses with the user in their language."},
+         {"role": "user", "content": "<|image|>\n" + text},
+     ]
+
+     inputs_formatted = tokenizer.apply_chat_template(
+         messages,
+         add_generation_prompt=True,
+         tokenize=False
+     )
+
+     # Tokenize around the <|image|> placeholder and splice in -200, the
+     # LLaVA-style sentinel index the model's remote code expects; [1:] drops
+     # the BOS token the tokenizer prepends to the second chunk.
+     text_chunks = [tokenizer(chunk).input_ids for chunk in inputs_formatted.split('<|image|>')]
+     input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(device)
+     attention_mask = torch.ones_like(input_ids).to(device)
+
+     return input_ids, attention_mask
+
+ # Inference function
+ def predict(prompt, img_url):
+     try:
+         image = Image.open(requests.get(img_url, stream=True, timeout=30).raw).convert('RGB')
+         image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)
+
+         input_ids, attention_mask = prepare_inputs(prompt, image, device=device)
+
+         output_ids = model.generate(
+             input_ids,
+             attention_mask=attention_mask,
+             images=image_tensor,
+             max_new_tokens=100,
+             use_cache=True,
+             do_sample=True,  # temperature/top_p only take effect when sampling
+             temperature=0.2,
+             top_p=0.2,
+             repetition_penalty=1.0
+         )[0]
+
+         result = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+         return result
+     except Exception as e:
+         return str(e)
+
+ # Gradio Interface
+ inputs = [
+     gr.Textbox(label="Prompt", placeholder="Ask about the food in the image"),
+     gr.Textbox(label="Image URL", placeholder="Enter an image URL")
+ ]
+
+ outputs = gr.Textbox(label="Generated Output")
+
+ gr.Interface(
+     fn=predict, inputs=inputs, outputs=outputs, title="Food Image AI Assistant",
+     description="This model can analyze food images and answer questions about them."
+ ).launch()
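
To try the app locally, a minimal sketch (assuming a CUDA GPU with enough memory for the 8B float16 weights; this commit pins no package versions):

pip install torch transformers pillow requests gradio
python app.py

Gradio serves the interface at http://127.0.0.1:7860 by default; paste an image URL and a prompt such as "What dish is this?" to test.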