import time
import warnings
from threading import Thread

import gradio as gr
import spaces
import torch
import transformers
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings("ignore")

device = "cuda"  # or "cpu"
torch.set_default_device(device)

model_name = "BAAI/Bunny-v1_1-Llama-3-8B-V"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # float32 for cpu
    device_map="auto",
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True)


@spaces.GPU
def bot_streaming(message, history):
    print(message)
    if message["files"]:
        # message["files"][-1] is a dict or just a string
        if type(message["files"][-1]) == dict:
            image_file = message["files"][-1]["path"]
        else:
            image_file = message["files"][-1]
    else:
        # if there's no image uploaded for this turn, look for images in the past turns
        # kept inside tuples, take the last one
        image_file = None
        for hist in history:
            if type(hist[0]) == tuple:
                image_file = hist[0][0]

    prompt = message["text"]

    if image_file is None:
        text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {prompt} ASSISTANT:"
        input_ids = torch.tensor(tokenizer(text).input_ids, dtype=torch.long).unsqueeze(0).to(device)
    else:
        text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
        # split around the <image> placeholder and splice in the image token id (-200)
        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0).to(device)

    if image_file is not None:
        image = Image.open(image_file)
        image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)
    else:
        image_tensor = None

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

    # run generation in a background thread and stream tokens back through the streamer
    thread = Thread(target=model.generate, kwargs=dict(
        inputs=input_ids,
        images=image_tensor,
        do_sample=True,
        temperature=0.2,
        top_p=0.7,
        max_new_tokens=512,
        streamer=streamer,
        use_cache=True,
        repetition_penalty=1.08
    ))
    thread.start()

    buffer = ""
    time.sleep(0.5)
    for new_text in streamer:
        if "<|end_of_text|>" in new_text:
            new_text = new_text.split("<|end_of_text|>")[0]
        buffer += new_text

        # generated_text_without_prompt = buffer[len(text_prompt):]
        generated_text_without_prompt = buffer
        # print(generated_text_without_prompt)

        time.sleep(0.06)
        # print(f"new_text: {generated_text_without_prompt}")
        yield generated_text_without_prompt


title_markdown = ("""
# 🐰 Bunny: A family of lightweight multimodal models

[📖 [Technical report](https://arxiv.org/abs/2402.11530)] | [🏠 [Code](https://github.com/BAAI-DCAI/Bunny)] | [🤗 [Bunny-v1.1-Llama-3-8B-V](https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V)] | [🤗 [Bunny-v1.1-4B](https://huggingface.co/BAAI/Bunny-v1_1-4B)] | [🤗 [Bunny-v1.0-3B](https://huggingface.co/BAAI/Bunny-v1_0-3B)]
""")

chatbot = gr.Chatbot(
    elem_id="chatbot",
    label="Bunny-v1.1-Llama-3-8B-V",
    avatar_images=["./assets/user.png", "./assets/icon.jpg"],
    height=550
)
chat_input = gr.MultimodalTextbox(
    interactive=True,
    file_types=["image"],
    placeholder="Enter message or upload file...",
    show_label=False
)

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(title_markdown)
    gr.ChatInterface(
        fn=bot_streaming,
        stop_btn="Stop Generation",
        multimodal=True,
        textbox=chat_input,
        chatbot=chatbot
    )
    gr.Examples(examples=[{"text": "What is the astronaut holding in his hand?", "files": ["./assets/example_1.png"]},
                          {"text": "Why is the image funny?", "files": ["./assets/example_2.png"]}],
                inputs=chat_input)

demo.queue(api_open=False)
demo.launch(show_api=False, share=False)