OtterHD-Demo / app.py
luodian's picture
add app
f49524d
raw
history blame
4.83 kB
import os
import datetime
import json
import base64
from PIL import Image
import gradio as gr
import hashlib
import requests
from utils import build_logger
import io
LOGDIR = "log"
logger = build_logger("otter", LOGDIR)
no_change_btn = gr.Button.update()
enable_btn = gr.Button.update(interactive=True)
disable_btn = gr.Button.update(interactive=False)
def decode_image(encoded_image: str) -> Image:
decoded_bytes = base64.b64decode(encoded_image.encode("utf-8"))
buffer = io.BytesIO(decoded_bytes)
image = Image.open(buffer)
return image
def encode_image(image: Image.Image, format: str = "PNG") -> str:
with io.BytesIO() as buffer:
image.save(buffer, format=format)
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
return encoded_image
def get_conv_log_filename():
t = datetime.datetime.now()
name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json")
return name
def get_conv_image_dir():
name = os.path.join(LOGDIR, "images")
os.makedirs(name, exist_ok=True)
return name
def get_image_name(image, image_dir=None):
buffer = io.BytesIO()
image.save(buffer, format="PNG")
image_bytes = buffer.getvalue()
md5 = hashlib.md5(image_bytes).hexdigest()
if image_dir is not None:
image_name = os.path.join(image_dir, md5 + ".png")
else:
image_name = md5 + ".png"
return image_name
def resize_image(image, max_size):
width, height = image.size
aspect_ratio = float(width) / float(height)
if width > height:
new_width = max_size
new_height = int(new_width / aspect_ratio)
else:
new_height = max_size
new_width = int(new_height * aspect_ratio)
resized_image = image.resize((new_width, new_height))
return resized_image
def http_bot(image_input, text_input, request: gr.Request):
logger.info(f"http_bot. ip: {request.client.host}")
print(f"Prompt request: {text_input}")
base64_image_str = encode_image(image_input)
payload = {
"content": [
{
"prompt": text_input,
"image": base64_image_str,
}
],
"token": "sk-OtterHD",
}
print(
"request: ",
{
"prompt": text_input,
"image": base64_image_str[:10],
},
)
url = "http://10.128.0.40:8890/app/otter"
headers = {"Content-Type": "application/json"}
response = requests.post(url, headers=headers, data=json.dumps(payload))
results = response.json()
print("response: ", {"result": results["result"]})
return results["result"]
title = """
# OTTER-HD: A High-Resolution Multi-modality Model
[[Otter Codebase]](https://github.com/Luodian/Otter) [[Paper]]() [[Checkpoints & Benchmarks]](https://huggingface.co/Otter-AI)
"""
css = """
#mkd {
height: 1000px;
overflow: auto;
border: 1px solid #ccc;
}
"""
if __name__ == "__main__":
with gr.Blocks(css=css) as demo:
gr.Markdown(title)
dialog_state = gr.State()
input_state = gr.State()
with gr.Tab("Ask a Question"):
with gr.Row(equal_height=True):
with gr.Column(scale=2):
image_input = gr.Image(label="Upload a High-Res Image", type="pil").style(height=600)
with gr.Column(scale=1):
vqa_output = gr.Textbox(label="Output").style(height=600)
text_input = gr.Textbox(label="Ask a Question")
vqa_btn = gr.Button("Send It")
gr.Examples(
[
[
"./assets/IMG_00095.png",
"How many camels are inside this image?",
],
[
"./assets/IMG_00095.png",
"How many people are inside this image?",
],
[
"./assets/IMG_00012.png",
"How many apples are there?",
],
# ["./assets/./IMG_00012.png", "How many apples are there? Count them row by row."],
[
"./assets/IMG_00080.png",
"What is this and where is it from?",
],
[
"./assets/IMG_00094.png",
"What's important on this website?",
],
],
inputs=[image_input, text_input],
outputs=[vqa_output],
fn=http_bot,
label="Click on any Examples below👇",
)
vqa_btn.click(fn=http_bot, inputs=[image_input, text_input], outputs=vqa_output)
demo.launch()