dhanilka committed
Commit 889a17f · 1 Parent(s): dd74106

Create app.py

Files changed (1)
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
import gradio as gr
import torch
from transformers import FuyuForCausalLM, AutoTokenizer
from transformers.models.fuyu.processing_fuyu import FuyuProcessor
from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
from PIL import Image

model_id = "adept/fuyu-8b"
dtype = torch.bfloat16

# Load the Fuyu-8B checkpoint and assemble its processor (image processor + tokenizer).
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = FuyuForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=dtype)
processor = FuyuProcessor(image_processor=FuyuImageProcessor(), tokenizer=tokenizer)

caption_prompt = "Generate a coco-style caption.\n"

def resize_to_max(image, max_width=1080, max_height=1080):
    # Downscale large images so both sides fit within the limits, preserving aspect ratio.
    width, height = image.size
    if width <= max_width and height <= max_height:
        return image

    scale = min(max_width / width, max_height / height)
    width = int(width * scale)
    height = int(height * scale)

    return image.resize((width, height), Image.LANCZOS)

def predict(image, prompt):
    # image = image.convert('RGB')
    image = resize_to_max(image)

    # Preprocess, then move every tensor to the model's device,
    # casting floating-point inputs to bfloat16.
    model_inputs = processor(text=prompt, images=[image])
    model_inputs = {
        k: v.to(dtype=dtype if torch.is_floating_point(v) else v.dtype, device=model.device)
        for k, v in model_inputs.items()
    }

    generation_output = model.generate(**model_inputs, max_new_tokens=40)
    prompt_len = model_inputs["input_ids"].shape[-1]
    return tokenizer.decode(generation_output[0][prompt_len:], skip_special_tokens=True)

def caption(image):
    return predict(image, caption_prompt)

# Note: this helper is defined but never wired to a component below;
# gr.Image.update is the Gradio 3.x way of updating a component's value.
def set_example_image(example: list) -> dict:
    return gr.Image.update(value=example[0])


css = """
#mkd {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML(
        """
        <h1 id="title">Fuyu Multimodal Demo</h1>
        <h3><a href="https://hf.co/adept/fuyu-8b">Fuyu-8B</a> is a multimodal model that supports a variety of tasks combining text and image prompts.</h3>
        For example, you can use it for captioning by asking it to describe an image. You can also ask it questions about an image, a task known as Visual Question Answering, or VQA. This demo lets you explore captioning and VQA, with more tasks coming soon :)
        Learn more about the model in <a href="https://www.adept.ai/blog/fuyu-8b">our blog post</a>.
        <br>
        <br>
        <strong>Note: This is a raw model release. We have not added further instruction tuning, post-processing, or sampling strategies to control for undesirable outputs. The model may hallucinate, and you should expect to fine-tune it for your use case!</strong>
        <h3>Play with Fuyu-8B in this demo! 💬</h3>
        """
    )
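    # The two tabs below expose the same predict() helper: the captioning tab
    # calls it via caption() with the fixed coco-style prompt, while the VQA tab
    # passes the user's question through unchanged.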
    with gr.Tab("Visual Question Answering"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label="Upload your Image", type="pil")
                text_input = gr.Textbox(label="Ask a Question")
            vqa_output = gr.Textbox(label="Output")

        vqa_btn = gr.Button("Answer Visual Question")

        gr.Examples(
            [["assets/vqa_example_1.png", "How is this made?"], ["assets/vqa_example_2.png", "What is this flower and where is its origin?"]],
            inputs=[image_input, text_input],
            outputs=[vqa_output],
            fn=predict,
            cache_examples=True,
            label="Click on any example below to get VQA results quickly 👇",
        )

    with gr.Tab("Image Captioning"):
        with gr.Row():
            captioning_input = gr.Image(label="Upload your Image", type="pil")
            captioning_output = gr.Textbox(label="Output")
        captioning_btn = gr.Button("Generate Caption")

        gr.Examples(
            [["assets/captioning_example_1.png"], ["assets/captioning_example_2.png"]],
            inputs=[captioning_input],
            outputs=[captioning_output],
            fn=caption,
            cache_examples=True,
            label="Click on any example below to get captioning results quickly 👇",
        )

    captioning_btn.click(fn=caption, inputs=captioning_input, outputs=captioning_output)
    vqa_btn.click(fn=predict, inputs=[image_input, text_input], outputs=vqa_output)


demo.launch(server_name="0.0.0.0")
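
For a quick sanity check outside the UI, the same inference path can be exercised directly with the objects defined above, for example from a session that has run everything up to (but not including) demo.launch(). A minimal sketch; the image path comes from the example assets referenced above and the question is illustrative:

# Illustrative sanity check (not part of this commit): reuse the helpers defined above.
test_image = Image.open("assets/vqa_example_1.png")   # example asset referenced above
print(predict(test_image, "How is this made?\n"))     # ad-hoc VQA query
print(caption(test_image))                            # coco-style caption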