Qwen-VL-Chat

Paused

App Files Files Community

Qwen-VL-Chat / app.py

Tonic

Update app.py

ea7c9d2 over 1 year ago

raw

history blame

5.75 kB

	import transformers
	from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
	import torch
	import gradio as gr
	import json
	import os
	import shutil
	import requests

	# Run on the GPU when one is available, otherwise fall back to the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Module-level generation settings.  ``FalconChatBot.predict`` declares its
	# own parameters named temperature / max_new_tokens / top_p /
	# repetition_penalty, so inside ``predict`` those parameters shadow the
	# values below.
	temperature = 0.4
	max_new_tokens = 240
	top_p = 0.92
	repetition_penalty = 1.7
	max_length = 2048

	model_name = "OpenLLM-France/Claire-7B-0.1"

	tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
	# NOTE: the original call was missing the comma after ``torch_dtype=...``,
	# which made the whole file a SyntaxError.
	model = transformers.AutoModelForCausalLM.from_pretrained(
	    model_name,
	    device_map="auto",
	    torch_dtype=torch.bfloat16,
	    load_in_4bit=True,  # For efficient inference, if supported by the GPU card
	)
	model = model.to_bettertransformer()

	# Class to encapsulate the Falcon chatbot
	class FalconChatBot:
	    """Chat wrapper around the module-level ``model`` and ``tokenizer``.

	    The instance only stores the system prompt; the model, tokenizer and
	    device are read from module globals when ``predict`` is called.
	    """

	    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
	        """Store the prompt that is placed at the top of every conversation."""
	        self.system_prompt = system_prompt

	    def process_history(self, history):
	        """Return a cleaned copy of ``history``.

	        Args:
	            history: Expected to be a list of dicts with ``"user"`` and
	                ``"assistant"`` keys.  Anything that is not a list
	                (including ``None``) yields an empty list.

	        Returns:
	            A new list of ``{"user": ..., "assistant": ...}`` dicts.
	            Non-dict entries are dropped, and so are entries whose user
	            message starts with the special command ``"Protagoniste:"``.
	            Missing keys default to ``""``.
	        """
	        if not isinstance(history, list):
	            return []

	        filtered_history = []
	        for message in history:
	            if not isinstance(message, dict):
	                continue
	            user_message = message.get("user", "")
	            assistant_message = message.get("assistant", "")
	            # Only strings can carry the special command; the isinstance
	            # guard keeps a non-string value (e.g. None) from raising
	            # AttributeError on ``.startswith``.
	            if isinstance(user_message, str) and user_message.startswith("Protagoniste:"):
	                continue
	            filtered_history.append({"user": user_message, "assistant": assistant_message})
	        return filtered_history

	    def predict(
	        self,
	        user_message,
	        assistant_message,
	        history,
	        temperature=0.4,
	        max_new_tokens=700,
	        top_p=0.99,
	        repetition_penalty=1.9,
	    ):
	        """Generate a reply and return it together with the updated history.

	        Args:
	            user_message: Text of the latest user turn.
	            assistant_message: Optional text placed before the user turn in
	                the prompt; a falsy value is rendered as an empty line.
	            history: Previous turns; cleaned with ``process_history``.  It
	                is only used to build the returned history, not the prompt.
	            temperature: Sampling temperature forwarded to ``generate``.
	            max_new_tokens: Generation length limit forwarded to ``generate``.
	            top_p: Nucleus-sampling threshold forwarded to ``generate``.
	            repetition_penalty: Repetition penalty forwarded to ``generate``.

	        Returns:
	            A ``(response_text, updated_history)`` tuple, where
	            ``updated_history`` is the cleaned history plus one new
	            ``{"user", "assistant"}`` entry for this turn.
	        """
	        # Process the history to remove special commands
	        processed_history = self.process_history(history)
	        # Combine the user and assistant messages into a conversation
	        conversation = f"{self.system_prompt}\n {assistant_message if assistant_message else ''}\n {user_message}\n "
	        # Encode the conversation using the tokenizer.  (The original moved
	        # ``input_ids`` to the device *before* this assignment, which raised
	        # UnboundLocalError on every call.)
	        input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=False)
	        input_ids = input_ids.to(device)
	        # Generate a response.  Only ``max_new_tokens`` is passed as the length
	        # limit: supplying ``max_length`` as well is redundant because the two
	        # settings conflict and ``max_new_tokens`` is the one that applies.
	        response = model.generate(
	            input_ids=input_ids,
	            use_cache=False,
	            early_stopping=False,
	            bos_token_id=model.config.bos_token_id,
	            eos_token_id=model.config.eos_token_id,
	            pad_token_id=model.config.eos_token_id,
	            temperature=temperature,
	            do_sample=True,
	            max_new_tokens=max_new_tokens,
	            top_p=top_p,
	            repetition_penalty=repetition_penalty,
	        )

	        # Decode the generated response to text
	        # NOTE(review): ``response[0]`` presumably starts with the prompt
	        # tokens, so the decoded text likely includes the prompt - confirm
	        # that this is intended.
	        response_text = tokenizer.decode(response[0], skip_special_tokens=True)
	        # Update and return the history with the new conversation
	        updated_history = processed_history + [{"user": user_message, "assistant": response_text}]
	        return response_text, updated_history


	# Create the Falcon chatbot instance (default system prompt); its
	# ``predict`` method is the callback passed to ``gr.Interface`` further down.
	falcon_bot = FalconChatBot()

	# Define the Gradio interface
	title = "👋🏻Bienvenue à Tonic's 🌜🌚Claire Chat !"
	description = "Vous pouvez utiliser [🌜🌚ClaireGPT](https://huggingface.co/OpenLLM-France/Claire-7B-0.1) Ou dupliquer pour l'uiliser localement ou sur huggingface! [Join me on Discord to build together](https://discord.gg/VqTxc76K3u)."

	# Seed conversation in the {"user", "assistant"} shape used by
	# ``FalconChatBot.process_history``.
	history = [
	    {
	        "user": "Le dialogue suivant est une conversation entre Emmanuel Macron et Elon Musk:",
	        "assistant": "Emmanuel Macron: Bonjour Monsieur Musk. Je vous remercie de me recevoir aujourd'hui.",
	    },
	]

	# Gradio examples are rows with one positional value per visible input
	# component, in component order.  (The original row was a single dict,
	# which is not a valid value for the first component, a textbox.)
	# Order: user message, assistant/scene message, temperature,
	# max new tokens, top-p, repetition penalty.
	examples = [
	    [
	        "[Elon Musk:] - Bonjour Emmanuel. Enchanté de vous revoir.",
	        "[Emmanuel Macron:] - Je vois que vous avez effectué un voyage dans la région de la Gascogne.",
	        0.4,
	        700,
	        0.90,
	        1.9,
	    ],
	]

	def _make_additional_inputs():
	    """Build the extra input controls: one textbox followed by four sliders.

	    The components are created, and returned, in the same order in which
	    they are passed to the interface.
	    """
	    scene_box = gr.Textbox(
	        "",
	        label="Introduisez Un Autre Personnage Ici ou Mettez En Scene",
	    )
	    temperature_slider = gr.Slider(
	        minimum=0.0,
	        maximum=1.0,
	        step=0.05,
	        value=0.9,
	        label="Temperature",
	        info="Higher values produce more diverse outputs",
	        interactive=True,
	    )
	    new_tokens_slider = gr.Slider(
	        minimum=0,
	        maximum=3000,
	        step=64,
	        value=256,
	        label="Max new tokens",
	        info="The maximum numbers of new tokens",
	        interactive=True,
	    )
	    top_p_slider = gr.Slider(
	        minimum=0.01,
	        maximum=0.99,
	        step=0.05,
	        value=0.90,
	        label="Top-p (nucleus sampling)",
	        info="Higher values sample more low-probability tokens",
	        interactive=True,
	    )
	    penalty_slider = gr.Slider(
	        minimum=1.0,
	        maximum=2.0,
	        step=0.05,
	        value=1.2,
	        label="Repetition penalty",
	        info="Penalize repeated tokens",
	        interactive=True,
	    )
	    return [
	        scene_box,
	        temperature_slider,
	        new_tokens_slider,
	        top_p_slider,
	        penalty_slider,
	    ]


	additional_inputs = _make_additional_inputs()

	# Build the Gradio interface.
	#
	# ``falcon_bot.predict`` takes, in order: user_message, assistant_message,
	# history, temperature, max_new_tokens, top_p, repetition_penalty, and it
	# returns (response_text, updated_history).  The inputs and outputs below are
	# laid out to match.  The original wiring had no component for ``history``
	# and only one output, so the Temperature slider was passed as ``history``
	# and every later slider landed on the wrong parameter.
	iface = gr.Interface(
	    fn=falcon_bot.predict,
	    title=title,
	    description=description,
	    examples=examples,
	    # Example caching cannot be combined with state inputs/outputs.
	    cache_examples=False,
	    inputs=[
	        # ``gr.Textbox`` replaces the deprecated ``gr.inputs.Textbox`` and
	        # matches how the other components in this file are created.
	        gr.Textbox(label="Utilisez se format pour initier une conversation [Personage:]", type="text", lines=5),
	        additional_inputs[0],  # second textbox -> assistant_message
	        gr.State(),  # per-session conversation history -> history
	        *additional_inputs[1:],  # the four sliders, in predict()'s order
	    ],
	    # Generated text is displayed; the updated history goes back into state.
	    outputs=["text", gr.State()],
	    theme="ParityError/Anime",
	)

	# Launch the Gradio interface for the Falcon model
	iface.launch()