# velvet / app.py
# dinhanhx's picture
# Move to cuda
# fb5ec17 verified
import gradio as gr
import spaces
from standalone_velvet import setup_models
# Build every inference component once at import time; run_inference reads
# these module-level names on each request instead of rebuilding them.
# NOTE(review): "visual_bloom.torch" is presumably a checkpoint path resolved
# by setup_models -- confirm against standalone_velvet.
models_dict = setup_models("visual_bloom.torch")

# Pre-processing helpers are used exactly as setup_models returns them.
tokenizer = models_dict["tokenizer"]
image_feature_collator = models_dict["image_feature_collator"]

# Only the model itself is moved to the GPU here; its inputs are moved
# per request inside run_inference.
visual_bloom = models_dict["visual_bloom"]
visual_bloom = visual_bloom.to('cuda')
@spaces.GPU
def run_inference(text_input, image_input):
    """Generate a text answer for one prompt/image pair.

    Args:
        text_input: Prompt string, e.g. ``Generate caption in en:``.
        image_input: Image handed over by the ``gr.Image`` component, or
            ``None`` when the user has not supplied one.

    Returns:
        The decoded model output (special tokens skipped), cut off at the
        first ``"."``.

    Raises:
        gr.Error: If ``image_input`` is ``None``.
    """
    # The image component yields None when nothing was uploaded; fail with a
    # readable message instead of an opaque error from inside the collator.
    if image_input is None:
        raise gr.Error("Please provide an image before running inference.")

    device = 'cuda'

    # Both helpers take a batch, so the single sample is wrapped in a list.
    image_features, image_attentions = image_feature_collator([image_input])
    instruction_inputs = tokenizer([text_input], return_tensors="pt")

    # Every tensor must live on the same device as the model.
    language_output = visual_bloom.generate(
        image_features.to(device),
        image_attentions.to(device),
        instruction_inputs["input_ids"].to(device),
        instruction_inputs["attention_mask"].to(device),
    )

    # Batch size is one, so only the first generated sequence is decoded.
    human_output = tokenizer.decode(language_output[0], skip_special_tokens=True)

    # Keep only the text before the first period.
    return human_output.split(".")[0]
if __name__ == "__main__":
    # Help text rendered at the top of the page. The literal is kept flush
    # left so the string content is exactly what the Markdown component gets.
    markdown = """
# Quick introduction
We have proposed a prompting vision language model.
The model can caption images and answer questions related to images.
It is trained on CC3M, COCO, VQAv2, OK-VQA, TextCaps, TextVQA.
As the result of using Google Translate,
these datasets collectively contain millions of image-text pairs in English and Vietnamese.
For further details, please refer to [Velvet](https://github.com/dinhanhx/velvet?tab=readme-ov-file#introduction).
# Usage
## Run with pre-defined examples
1. Scroll to bottom of the page to see the examples.
2. Click one of them.
3. Click the `Run Inference` button.
## Run with user-defined inputs
### 1. Prepare text input
Image captioning:
- `Generate caption in en:`
- `Generate caption in vi:`
Visual question answering:
- `Generate answer in en: <question>?`
- `Generate answer in vi: <question>?`
Don't forget to replace `<question>` with your own question either in English or Vietnamese.
To write the prompt, one can refer to the examples at the bottom of the page.
### 2. Prepare image input
You can do as said in Image Input box. Wide range of image types are supported by PIL.
### 3. Click the `Run Inference` button
"""

    # One row per example: [prompt, image path], in the same order as the
    # `inputs` list given to gr.Examples below.
    example_inputs = [
        ["Generate caption in en:", "examples/cat.png"],
        ["Generate caption in vi:", "examples/cat.png"],
        ["Generate answer in en: what is the color of the cat?", "examples/cat.png"],
        ["Generate answer in vi: màu sắc của con mèo là gì?", "examples/cat.png"],
    ]

    with gr.Blocks() as demo:
        gr.Markdown(markdown)

        text_input = gr.Textbox(label="Text Input")
        # type="pil" asks the component for a PIL image.
        image_input = gr.Image(label="Image Input", type="pil")
        text_output = gr.Textbox(label="Text Output")

        infer_button = gr.Button("Run Inference")
        infer_button.click(
            run_inference, inputs=[text_input, image_input], outputs=text_output
        )

        # No function is attached to the examples: clicking one only fills
        # the two inputs, and the user still presses the button. The
        # component is created for its side effect, so it is not bound to
        # a name.
        gr.Examples(
            examples=example_inputs,
            inputs=[text_input, image_input],
        )

    demo.launch()