import transformers
import torch
import gradio as gr

# Define the device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Default generation settings
temperature = 0.4
max_new_tokens = 240
top_p = 0.92
repetition_penalty = 1.7

model_name = "OpenLLM-France/Claire-7B-0.1"

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    load_in_4bit=True  # For efficient inference, if supported by the GPU card
)
# BetterTransformer conversion requires the `optimum` package
model = model.to_bettertransformer()

# Class to encapsulate the Falcon chatbot
class FalconChatBot:
    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
        self.system_prompt = system_prompt
        self.history = []

    def process_history(self, history):
        if history is None:
            return []
        
        # Ensure that history is a list of dictionaries
        if not isinstance(history, list):
            return []
        
        # Filter out special commands from the history
        filtered_history = []
        for message in history:
            if isinstance(message, dict):
                user_message = message.get("user", "")
                assistant_message = message.get("assistant", "")
                # Check if the user_message is not a special command
                if not user_message.startswith("Protagoniste:"):
                    filtered_history.append({"user": user_message, "assistant": assistant_message})
        return filtered_history

    def predict(self, user_message, assistant_message, temperature=0.4,
                max_new_tokens=700, top_p=0.99, repetition_penalty=1.9, history=None):
        # Parameter order mirrors the Gradio inputs below; history is optional
        # because the interface does not supply a history component.
        # Process the history to remove special commands
        processed_history = self.process_history(history)
        # Combine the system prompt with the assistant and user messages
        conversation = f"{self.system_prompt}\n {assistant_message if assistant_message else ''}\n {user_message}\n "
        # Encode the conversation using the tokenizer
        input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=False)
        input_ids = input_ids.to(device)
        # Generate a response using the Claire model
        response = model.generate(
            input_ids=input_ids,
            bos_token_id=model.config.bos_token_id,
            eos_token_id=model.config.eos_token_id,
            pad_token_id=model.config.eos_token_id,
            temperature=temperature,
            do_sample=True,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty
        )
        # Decode only the newly generated tokens, skipping the prompt
        response_text = tokenizer.decode(response[0][input_ids.shape[-1]:], skip_special_tokens=True)
        # Keep the updated history on the instance; the Gradio "text" output
        # only accepts the response string
        self.history = processed_history + [{"user": user_message, "assistant": response_text}]
        return response_text


# Create the Falcon chatbot instance
falcon_bot = FalconChatBot()
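
# A quick smoke test (hypothetical values) that calls predict() directly,
# bypassing the Gradio UI; handy for checking generation settings:
#   reply = falcon_bot.predict("[Elon Musk:] - Bonjour Emmanuel.", "",
#                              temperature=0.4, max_new_tokens=100)
#   print(reply)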

# Define the Gradio interface
title = "👋🏻Bienvenue à Tonic's 🌜🌚Claire Chat !"
description = "Vous pouvez utiliser [🌜🌚ClaireGPT](https://huggingface.co/OpenLLM-France/Claire-7B-0.1) ou le dupliquer pour l'utiliser localement ou sur Hugging Face ! [Join me on Discord to build together](https://discord.gg/VqTxc76K3u)."
history = [
    {"user": "Le dialogue suivant est une conversation entre Emmanuel Macron et Elon Musk:",
     "assistant": "Emmanuel Macron: Bonjour Monsieur Musk. Je vous remercie de me recevoir aujourd'hui."},
]
# Each example supplies one value per input component, in the same order as
# the inputs list below: user message, scene/other character, temperature,
# max new tokens, top-p, repetition penalty
examples = [
    [
        "[Elon Musk:] - Bonjour Emmanuel. Enchanté de vous revoir.",
        "[Emmanuel Macron:] - Je vois que vous avez effectué un voyage dans la région de la Gascogne.",
        0.4,
        700,
        0.90,
        1.9,
    ]
]

additional_inputs = [
    gr.Textbox("", label="Introduisez Un Autre Personnage Ici ou Mettez En Scène"),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=3000,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.01,
        maximum=0.99,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]

iface = gr.Interface(
    fn=falcon_bot.predict,
    title=title,
    description=description,
    examples=examples,
    inputs=[
        gr.Textbox(label="Utilisez ce format pour initier une conversation [Personnage:]", lines=5),
    ] + additional_inputs,
    outputs="text",
    theme="ParityError/Anime"
)

# Launch the Gradio interface for the Falcon model
iface.launch()
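
# Note: on a Hugging Face Space, launch() needs no arguments; when running
# locally, launch(share=True) can expose a temporary public URL (optional).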