import os import datetime import json import base64 from PIL import Image import gradio as gr import hashlib import requests import io # LOGDIR = "log" # logger = build_logger("otter", LOGDIR) # no_change_btn = gr.Button.update() # enable_btn = gr.Button.update(interactive=True) # disable_btn = gr.Button.update(interactive=False) def decode_image(encoded_image: str) -> Image: decoded_bytes = base64.b64decode(encoded_image.encode("utf-8")) buffer = io.BytesIO(decoded_bytes) image = Image.open(buffer) return image def encode_image(image: Image.Image, format: str = "PNG") -> str: with io.BytesIO() as buffer: image.save(buffer, format=format) encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") return encoded_image def get_conv_log_filename(): t = datetime.datetime.now() name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json") return name def get_conv_image_dir(): name = os.path.join(LOGDIR, "images") os.makedirs(name, exist_ok=True) return name def get_image_name(image, image_dir=None): buffer = io.BytesIO() image.save(buffer, format="PNG") image_bytes = buffer.getvalue() md5 = hashlib.md5(image_bytes).hexdigest() if image_dir is not None: image_name = os.path.join(image_dir, md5 + ".png") else: image_name = md5 + ".png" return image_name def resize_image(image, max_size): width, height = image.size aspect_ratio = float(width) / float(height) if width > height: new_width = max_size new_height = int(new_width / aspect_ratio) else: new_height = max_size new_width = int(new_height * aspect_ratio) resized_image = image.resize((new_width, new_height)) return resized_image def http_bot(image_input, text_input, request: gr.Request): print(f"http_bot. ip: {request.client.host}") print(f"Prompt request: {text_input}") base64_image_str = encode_image(image_input) payload = { "content": [ { "prompt": text_input, "image": base64_image_str, } ], "token": "sk-OtterHD", } print( "request: ", { "prompt": text_input, "image": base64_image_str[:10], }, ) url = "https://rouge-surrey-katrina-signatures.trycloudflare.com/app/otter" headers = {"Content-Type": "application/json"} response = requests.post(url, headers=headers, data=json.dumps(payload)) results = response.json() print("response: ", {"result": results["result"]}) return results["result"] title = """ # OTTER-HD: A High-Resolution Multi-modality Model [[Otter Codebase]](https://github.com/Luodian/Otter) [[Paper]](https://arxiv.org/abs/2311.04219) [[Checkpoints & Benchmarks]](https://huggingface.co/Otter-AI) **OtterHD** is a multimodal fine-tuned from [Fuyu-8B](https://huggingface.co/adept/fuyu-8b) to facilitate a more fine-grained interpretation of high-resolution visual input *without a explicit vision encoder module*. All image patches are linear transformed and processed together with text tokens. This is a very innovative and elegant exploration. We are fascinated and paved in this way, we opensourced the finetune script for Fuyu-8B and improve training throughput by 4-5 times faster with [Flash-Attention-2](https://github.com/Dao-AILab/flash-attention). **Tips**: - Since high-res images are large that may cause the longer transmit time from HF Space to our backend server. Please be kinda patient for the response. - We are working on to finetune the model on LLaVA-1.5/LRV/LLaVAR data mixture and balance the detailed recognition and hallucination reduction. Stay tuned! - Please do not upload any NSFW images and ask relevant questions. We will ban the IP address if we found any inappropriate usage. """ css = """ #mkd { height: 1000px; overflow: auto; border: 1px solid #ccc; } """ if __name__ == "__main__": with gr.Blocks(css=css) as demo: gr.Markdown(title) dialog_state = gr.State() input_state = gr.State() with gr.Tab("Ask a Question"): with gr.Row(equal_height=True): with gr.Column(scale=2): image_input = gr.Image(label="Upload a High-Res Image", type="pil") with gr.Column(scale=1): vqa_output = gr.Textbox(label="Output") text_input = gr.Textbox(label="Ask a Question") vqa_btn = gr.Button("Send It") gr.Examples( [ [ "./assets/IMG_00095.png", "How many camels are inside this image?", ], [ "./assets/IMG_00057.png", "What's this image about?", ], [ "./assets/IMG_00040.png", "What are the scene texts in this image?", ], [ "./assets/./IMG_00012.png", "How many apples are there? Count them row by row.", ], [ "./assets/IMG_00080.png", "What is this and where is it from?", ], [ "./assets/IMG_00041.png", "What are the scene texts in this image?", ], ], inputs=[image_input, text_input], outputs=[vqa_output], fn=http_bot, label="Click on any Examples below👇", ) vqa_btn.click(fn=http_bot, inputs=[image_input, text_input], outputs=vqa_output) demo.launch()