diabolic6045 committed on
Commit
1c88632
·
verified ·
1 Parent(s): 8f12504

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -0
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoImageProcessor
4
+ from PIL import Image
5
+ import requests
6
+
7
# Load the checkpoint once at import time so every request reuses the same
# model, image processor, and tokenizer objects.
MODEL_ID = "stabilityai/japanese-stable-vlm"

# Device that per-request input tensors are moved to in generate_text.
device = "cuda" if torch.cuda.is_available() else "cpu"

# trust_remote_code=True allows modelling code shipped with the checkpoint
# to be executed while loading.
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
)

# NOTE(review): device_map is forwarded here exactly as in the original code;
# it is presumably a no-op for the processor and tokenizer -- confirm before
# removing it.
processor = AutoImageProcessor.from_pretrained(MODEL_ID, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, device_map="auto")
12
+
13
# Instruction text sent to the model for each supported task.
TASK2INSTRUCTION = {
    "caption": "画像を詳細に述べてください。",
    "tag": "与えられた単語を使って、画像を詳細に述べてください。",
    "vqa": "与えられた画像を下に、質問に答えてください。",
}


def build_prompt(task="caption", input=None, sep="\n\n### "):
    """Build the text prompt for one of the tasks in TASK2INSTRUCTION.

    The prompt is a fixed system message followed by an instruction section,
    an optional input section, and an empty response section, each introduced
    by ``sep`` and a role label.

    Args:
        task: Key of TASK2INSTRUCTION ("caption", "tag" or "vqa").
        input: Extra text for the "tag" and "vqa" tasks. For "tag" a list or
            tuple of words is also accepted and joined with "、". ``None`` or
            a blank string means "no input". The name shadows the builtin but
            is kept because callers pass it by keyword.
        sep: Separator placed before every section header.

    Returns:
        The assembled prompt string.

    Raises:
        ValueError: If ``task`` is unknown, if "tag"/"vqa" is requested
            without any input, or if input is supplied for a task that does
            not take one.
    """
    # Explicit raises instead of `assert`: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if task not in TASK2INSTRUCTION:
        raise ValueError(f"Please choose from {list(TASK2INSTRUCTION.keys())}")

    if task == "tag" and isinstance(input, (list, tuple)):
        input = "、".join(input)

    # Treat a blank string like a missing value, so an empty UI text field
    # neither breaks "caption" nor yields a "tag"/"vqa" prompt with no input.
    if isinstance(input, str) and not input.strip():
        input = None

    if task in ("tag", "vqa"):
        if input is None:
            raise ValueError("Please fill in `input`!")
    elif input is not None:
        raise ValueError(f"`{task}` mode doesn't support to input questions")

    sys_msg = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。"

    # (role label, section body) pairs in the order they appear in the prompt.
    sections = [("指示", TASK2INSTRUCTION[task])]
    if input is not None:
        sections.append(("入力", input))
    # The response section is left empty for the model to complete.
    sections.append(("応答", ""))

    return sys_msg + "".join(f"{sep}{role}: \n{body}" for role, body in sections)
39
+
40
# Run one image plus task prompt through the model and return the decoded text.
# NOTE(review): the decorator references `spaces`, but no `import spaces` is
# visible among this file's imports; as written the module raises NameError
# when this definition is executed. Add the import next to the others.
@spaces.GPU(duration=120)
def generate_text(image, task, input_text=None):
    """Generate text for ``image`` according to ``task``.

    Args:
        image: Image passed to the image processor (value of the Gradio
            image input).
        task: Task name forwarded to build_prompt.
        input_text: Optional extra text for the task; blank text is treated
            as absent.

    Returns:
        The first decoded generation, stripped of surrounding whitespace.

    Raises:
        gr.Error: If no image was supplied or the task/input combination is
            rejected by build_prompt, so the UI shows a readable message.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    # A blank textbox may arrive as an empty string rather than None;
    # build_prompt expects None when there is no extra input.
    if isinstance(input_text, str) and not input_text.strip():
        input_text = None

    try:
        prompt = build_prompt(task=task, input=input_text)
    except (AssertionError, ValueError) as err:
        # Surface validation problems in the UI instead of a bare traceback.
        raise gr.Error(str(err)) from err

    # Image features and prompt token ids are merged into one batch.
    inputs = processor(images=image, return_tensors="pt")
    text_encoding = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
    inputs.update(text_encoding)

    outputs = model.generate(
        **inputs.to(device=device, dtype=model.dtype),
        do_sample=False,
        num_beams=5,
        max_new_tokens=128,
        min_length=1,
        repetition_penalty=1.5,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded[0].strip()
57
+
58
# Gradio UI: an image upload, a task selector, and a free-text field are
# passed to generate_text; its return value is shown in one textbox.
image_input = gr.Image(label="Upload an image")
task_input = gr.Radio(
    choices=["caption", "tag", "vqa"],
    value="caption",
    label="Select a task",
)
text_input = gr.Textbox(label="Enter text (for tag or vqa tasks)")

output = gr.Textbox(label="Generated text")

# One example row per task, all pointing at the same image path.
example_image = "examples/example_image.jpg"
example_rows = [
    [example_image, "caption", None],
    [example_image, "tag", "河津桜、青空"],
    [example_image, "vqa", "OCRはできますか?"],
]

interface = gr.Interface(
    fn=generate_text,
    inputs=[image_input, task_input, text_input],
    outputs=output,
    examples=example_rows,
)

# Launch happens at module level, i.e. as soon as this file is executed.
interface.launch()