Qwen-VL-Chat / app.py
Tonic's picture
Update app.py
35254ca
raw
history blame
4.36 kB
import optimum
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM
from optimum.bettertransformer import BetterTransformer
import torch
import gradio as gr
import json
import os
import shutil
import requests
# Use the GPU when one is visible to torch; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Module-level sampling settings. Nothing in the visible code reads them
# (FalconChatBot.predict carries its own defaults); they are kept so any
# external reference to these names keeps working.
temperature = 0.4
max_new_tokens = 240
top_p = 0.92
repetition_penalty = 1.7

# Checkpoint to serve, and its matching tokenizer.
model_name = "OpenLLM-France/Claire-7B-0.1"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # 4-bit loading is only requested when a CUDA device is available.
    # NOTE(review): bitsandbytes quantization is expected to require a GPU,
    # so asking for it unconditionally would abort start-up on CPU-only
    # hosts even though `device` above explicitly supports that fallback.
    load_in_4bit=(device == "cuda"),
)
# Swap in the BetterTransformer (optimum) implementation of the model.
model = BetterTransformer.transform(model)
# Chatbot wrapper around the module-level `tokenizer` and `model`
class FalconChatBot:
    """Builds a prompt from a fixed preamble plus two text fields and samples a continuation."""

    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
        # Preamble placed at the very start of every prompt.
        self.system_prompt = system_prompt

    def predict(self, user_message, assistant_message, temperature=0.4, max_new_tokens=700, top_p=0.99, repetition_penalty=1.9):
        """Generate text for one request and return it decoded.

        The prompt is ``system_prompt``, ``assistant_message`` (or an empty
        string when it is falsy) and ``user_message``, separated by single
        spaces and followed by a trailing space.

        Args:
            user_message: Text appended last in the prompt.
            assistant_message: Optional text placed between the preamble and
                ``user_message``.
            temperature: Sampling temperature forwarded to ``model.generate``.
            max_new_tokens: Token budget forwarded to ``model.generate``.
            top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
            repetition_penalty: Repetition penalty forwarded to
                ``model.generate``.

        Returns:
            The first generated sequence decoded with special tokens skipped.
            NOTE(review): for a causal LM this presumably still contains the
            prompt text, since nothing here strips it -- confirm if that is
            the intended output.
        """
        middle = assistant_message or ''
        prompt = f"{self.system_prompt} {middle} {user_message} "

        # No special tokens are added: the prompt is tokenized verbatim, then
        # moved to the device chosen at module load.
        prompt_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False).to(device)

        # EOS doubles as the padding token for generation.
        eos_id = model.config.eos_token_id
        generation_options = dict(
            use_cache=False,
            early_stopping=False,
            bos_token_id=model.config.bos_token_id,
            eos_token_id=eos_id,
            pad_token_id=eos_id,
            temperature=temperature,
            do_sample=True,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        sequences = model.generate(input_ids=prompt_ids, **generation_options)

        # Only the first returned sequence is decoded.
        return tokenizer.decode(sequences[0], skip_special_tokens=True)


# Single chatbot instance, created with the default preamble
falcon_bot = FalconChatBot()
# Static texts and the example row handed to the Gradio interface
title = "👋🏻Bienvenue à Tonic's 🌜🌚Claire Chat !"

# Markdown blurb, split across adjacent literals purely for line length.
description = (
    "Vous pouvez utiliser "
    "[🌜🌚ClaireGPT](https://huggingface.co/OpenLLM-France/Claire-7B-0.1) "
    "Ou dupliquer pour l'uiliser localement ou sur huggingface! "
    "[Join me on Discord to build together](https://discord.gg/VqTxc76K3u)."
)

# One example row. Column order: user_message, assistant_message,
# max_new_tokens, temperature, top_p, repetition_penalty.
examples = [
    [
        "Le dialogue suivant est une conversation entre Emmanuel Macron et Elon Musk:",
        "[Emmanuel Macron]: Bonjour Monsieur Musk. Je vous remercie de me recevoir aujourd'hui.",
        150,
        0.9,
        0.90,
        1.9,
    ],
]
# Extra controls listed after the main prompt box: one free-text field, then
# four sliders (max new tokens, temperature, top-p, repetition penalty).
# The sliders share `interactive=True`, so they are built from a spec table.
additional_inputs = [
    gr.Textbox("", label="Introduisez Un Personnage Ici ou Mettez En Scene"),
    *(
        gr.Slider(interactive=True, **slider_spec)
        for slider_spec in (
            dict(
                label="Max new tokens",
                value=100,
                minimum=25,
                maximum=256,
                step=1,
                info="The maximum numbers of new tokens",
            ),
            dict(
                label="Temperature",
                value=0.7,
                minimum=0.05,
                maximum=1.0,
                step=0.05,
                info="Higher values produce more diverse outputs",
            ),
            dict(
                label="Top-p (nucleus sampling)",
                value=0.90,
                minimum=0.01,
                maximum=0.99,
                step=0.05,
                info="Higher values sample more low-probability tokens",
            ),
            dict(
                label="Repetition penalty",
                value=1.9,
                minimum=1.0,
                maximum=2.0,
                step=0.05,
                info="Penalize repeated tokens",
            ),
        )
    ),
]
def _predict_from_ui(user_message, assistant_message, max_new_tokens, temperature, top_p, repetition_penalty):
    """Bridge the UI's input order to ``falcon_bot.predict``.

    The interface lists its controls (and its example row) as: prompt box,
    character/scene box, max new tokens, temperature, top-p, repetition
    penalty. ``FalconChatBot.predict`` declares ``temperature`` before
    ``max_new_tokens``, so wiring it in directly would use the token-budget
    slider as the temperature and the temperature slider as the token budget.
    Forwarding by keyword removes that dependency on parameter order.

    Args:
        user_message: Content of the main prompt box.
        assistant_message: Content of the character/scene box.
        max_new_tokens: Value of the "Max new tokens" slider.
        temperature: Value of the "Temperature" slider.
        top_p: Value of the "Top-p (nucleus sampling)" slider.
        repetition_penalty: Value of the "Repetition penalty" slider.

    Returns:
        The text returned by ``falcon_bot.predict``.
    """
    return falcon_bot.predict(
        user_message,
        assistant_message,
        temperature=temperature,
        # Coerced so generation gets an integer even if the slider value
        # arrives as a float.
        max_new_tokens=int(max_new_tokens),
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )


# Assemble the interface: the main prompt box comes first, followed by the
# extra controls, in the same order as the columns of `examples`.
iface = gr.Interface(
    fn=_predict_from_ui,
    title=title,
    description=description,
    examples=examples,
    inputs=[
        gr.Textbox(label="Utilisez se format pour initier une conversation [Personage:]", type="text", lines=5),
    ] + additional_inputs,
    outputs="text",
    theme="ParityError/Anime"
)
# Launch the Gradio interface
iface.launch()