|
import gradio as gr |
|
import spaces |
|
from huggingface_hub import login |
|
import accelerate |
|
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer |
|
import os |
|
import torch |
|
from typing import Optional, Iterator, Dict, Any, List |
|
from threading import Thread |
|
|
import traceback |
|
|
|
|
|
|
|
print(f"Is CUDA available: {torch.cuda.is_available()}") |
|
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}") |
|
|
|
MAX_NEW_TOKENS = 2**13 |
|
DEFAULT_MAX_NEW_TOKENS = int(0.65 * MAX_NEW_TOKENS)
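# DEFAULT_MAX_NEW_TOKENS is 65% of the 8192-token ceiling. The system prompt below (in French)
# frames the model as an expert at extracting data from long, noisy documents, understanding the
# subject through the semantic links it can extract, building hierarchical concepts and the
# relations between them, and answering clearly, formally, and to the point.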
|
DEFAULT_SYSTEM_PROMPT = """ |
|
Tu es un expert en extraction de données dans des documents très longs et bruités. |
|
Tu comprends le sujet grâce à des liens sémantiques que tu peux extraire. |
|
Tu sers à créer des concepts hiérarchiques ainsi que des liens entre ceux-ci. |
|
Réponds de manière claire et formelle et va droit au but dans ta tâche. |
|
""" |
|
|
|
class HuggingFaceLogin: |
|
"""Handles authentication to the Hugging Face Hub using environment variables or explicit tokens.""" |
|
def __init__(self, env_token_key: str = "HF_TOKEN"): |
|
"""Initialize the login handler. |
|
|
|
Args: |
|
env_token_key (str): Environment variable key containing the token. Defaults to "HF_TOKEN". |
|
""" |
|
self.token = os.getenv(env_token_key) |
|
|
|
    def login(self, token: Optional[str] = None) -> bool:
|
"""Authenticate with the Hugging Face Hub. |
|
|
|
Args: |
|
token (Optional[str]): Optional explicit token. If not provided, uses token from environment. |
|
|
|
Returns: |
|
bool: True if login successful, False otherwise. |
|
|
|
Raises: |
|
ValueError: If no token is available (neither in env nor passed explicitly). |
|
""" |
|
|
|
        if token:
            self.token = token
        if not self.token:
            raise ValueError("No authentication token provided. Set HF_TOKEN environment variable or pass token explicitly.")
|
try: |
|
print("Logging in to the Hugging Face Hub...") |
|
login(token=self.token) |
|
return True |
|
except Exception as e: |
|
print(f"Login failed: {str(e)}") |
|
return False |
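# NF4 4-bit quantization preset (nested quantization, fp16 compute): the most memory-frugal
# option. It is defined as an alternative and is not the config passed to from_pretrained below.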
|
|
|
model_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
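# LLM.int8() 8-bit preset: int8 weights, with activation outliers above llm_int8_threshold kept
# in fp16. This is the config actually used when loading the model.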
|
|
|
model_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False
)
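
# Llama-3.1-8B-Instruct is a gated checkpoint on the Hub: the HF_TOKEN used for login must have
# been granted access, otherwise from_pretrained below will fail to download the weights.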
|
|
|
if torch.cuda.is_available():
    model_id = "meta-llama/Llama-3.1-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=model_config_8bit,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
    print("CUDA is not available: skipping model load, so generation below will fail without a GPU.")
|
|
|
|
|
@spaces.GPU
def generate_llm_response(
|
conversation: List[Dict[str, str]], |
|
max_new_tokens: int, |
|
temperature: float, |
|
top_p: float, |
|
top_k: int, |
|
repetition_penalty: float |
|
) -> Iterator[str]:

    """Stream the model's response to the conversation, yielding the accumulated text so far."""
|
input_ids = tokenizer.apply_chat_template( |
|
conversation, |
|
return_tensors="pt", |
|
add_generation_prompt=True |
|
) |
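
    # apply_chat_template renders the Llama 3.1 chat format and, with add_generation_prompt=True,
    # ends the prompt with an assistant header so the model answers as the assistant.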
|
|
|
input_ids = input_ids.to(model.device) |
|
|
|
streamer = TextIteratorStreamer( |
|
tokenizer, |
|
timeout=2*60.0, |
|
skip_prompt=True, |
|
skip_special_tokens=True |
|
) |
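
    # model.generate runs in a background thread below and pushes decoded text into the streamer;
    # the 120-second timeout aborts iteration if no new text arrives within that window.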
|
|
|
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id,
    )
|
|
|
t = Thread( |
|
target=model.generate, |
|
kwargs=generate_kwargs |
|
) |
|
t.start() |
|
|
|
|
|
accumulated_response = "" |
|
for text in streamer: |
|
accumulated_response += text |
|
yield accumulated_response |
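
    # The streamer is exhausted only once generation has finished, so joining the worker
    # thread here simply tidies it up.
    t.join()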
|
|
|
def append_text_knowledge(file_path: str) -> str: |
|
""" |
|
Reads content from a selected file and returns it as a string. |
|
|
|
Args: |
|
file_path (str): Path to the selected file |
|
|
|
Returns: |
|
str: Content of the file or empty string if no file selected |
|
""" |
|
if file_path: |
|
try: |
|
with open(file_path, "r", encoding="utf-8") as f: |
|
return f.read() |
|
except Exception as e: |
|
print("Error reading file: {e}") |
|
return "" |
|
return "" |
|
|
|
knowledge_textbox = gr.Textbox( |
|
label="Knowledge Text", |
|
    lines=20,
|
visible=False |
|
) |
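
# Note: this module-level Textbox is never added to the Blocks layout below; the knowledge_input
# component defined inside the demo is the one actually wired to the file explorer.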
|
|
|
with gr.Blocks() as demo: |
|
gr.Markdown("# Ontology Generation with Chain-of-Thought") |
|
chatbot = gr.Chatbot(type="messages") |
|
message_input = gr.Textbox( |
|
label="message", |
|
placeholder="Ask about the elicitation text...", |
|
lines=2, |
|
submit_btn=True |
|
) |
|
|
|
with gr.Row(): |
|
file_explorer = gr.FileExplorer( |
|
glob="**/*.txt", |
|
file_count="single", |
|
label="Upload file", |
|
show_label=True |
|
) |
|
knowledge_input = gr.Textbox( |
|
label="Knowledge text", |
|
lines=6, |
|
visible=True |
|
) |
|
with gr.Accordion("Advanced Settings", open=False): |
|
system_prompt_input = gr.Textbox( |
|
label="System Prompt", |
|
lines=4, |
|
value=DEFAULT_SYSTEM_PROMPT |
|
) |
|
with gr.Row(): |
|
with gr.Column(): |
|
max_tokens_slider = gr.Slider( |
|
label="Max new tokens", |
|
minimum=1, |
|
maximum=MAX_NEW_TOKENS, |
|
step=1, |
|
value=DEFAULT_MAX_NEW_TOKENS |
|
) |
|
temperature_slider = gr.Slider( |
|
label="Temperature", |
|
minimum=0.1, |
|
maximum=4.0, |
|
step=0.1, |
|
value=0.2 |
|
) |
|
|
|
with gr.Column(): |
|
top_p_slider = gr.Slider( |
|
label="Top-p (nucleus sampling)", |
|
minimum=0.05, |
|
maximum=1.0, |
|
step=0.05, |
|
value=0.8 |
|
) |
|
top_k_slider = gr.Slider( |
|
label="Top-k", |
|
minimum=1, |
|
maximum=1000, |
|
step=1, |
|
value=50 |
|
) |
|
repetition_penalty_slider = gr.Slider( |
|
label="Repetition penalty", |
|
minimum=1.0, |
|
maximum=2.0, |
|
step=0.05, |
|
value=1.0 |
|
) |
|
|
|
|
|
examples = gr.Examples( |
|
examples=[ |
|
["Extract meaningful entities in your knowledge document in order to create a Turtle-formatted output where you create classes and sub-classes and object properties automatically."], |
|
["Make a simple list of the classes, sub-classes and object properties that can be extracted from the knowledge document."] |
|
], |
|
inputs=message_input |
|
) |
|
|
|
    def user_message(message: str, history: List[Dict[str, str]]):
|
"""Add user message to chat history. |
|
|
|
Args: |
|
            message (str): The user message to append to the conversation
            history (List[Dict[str, str]]): The previous chat conversation history
|
""" |
|
if message.strip() == "": |
|
return history, message |
|
|
|
history = history + [{"role":"user", "content": message}] |
|
return history, "" |
|
|
|
def bot_response(history, knowledge, system_prompt, max_tokens, temp, top_p, top_k, rep_penalty): |
|
"""Generate assistant response with visible thinking. |
|
|
|
Args: |
|
            history (List[Dict[str, str]]): The previous chat conversation history
            knowledge (str): Document text passed to the model as background knowledge
            system_prompt (str): System prompt that the model follows
            max_tokens (int): Maximum number of new tokens in the output
            temp (float): Sampling temperature
            top_p (float): Top-p (nucleus sampling) value
            top_k (int): Top-k value
            rep_penalty (float): Repetition penalty
|
|
|
        Yields:
            List[Dict[str, str]]: The updated conversation history
|
""" |
|
try: |
|
if not history or history[-1]["role"] != "user": |
|
return history |
|
|
|
user_message = history[-1]["content"] |
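
            # gr.Chatbot(type="messages") shows an assistant message whose metadata carries a
            # "title" as a collapsible thinking bubble; "status" ("pending"/"done") drives its spinner.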
|
|
|
history.append({ |
|
"role": "assistant", |
|
"content": "Je réfléchis étape par étape...", |
|
"metadata": { |
|
"title": "Réflexion", |
|
"status": "pending" |
|
} |
|
}) |
|
yield history |
|
|
|
thinking_conversation = [] |
|
if system_prompt: |
|
thinking_conversation.append({"role": "system", "content": system_prompt}) |
|
if knowledge: |
|
thinking_conversation.append({ |
|
"role": "assistant", |
|
"content": f"Voici le document que je dois comprendre: {knowledge}\n\nJe vais l'analyser étape par étape." |
|
}) |
|
|
|
for msg in history[:-2]: |
|
thinking_conversation.append(msg) |
|
|
|
thinking_prompt = user_message + "\n\nRéfléchis étape par étape. D'abord identifie l'intention de l'utilisateur. Quand tu as compris ce qui t'est demandé, commence à établir un plan clair et précis que tu peux suivre. Utilise l'italic et le gras en Markdown pour séquencer et prioriser tes actions." |
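
            # The French instruction appended above asks the model to reason step by step: first
            # identify the user's intent, then lay out a clear and precise plan, using Markdown
            # bold/italics to sequence and prioritise its actions.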
|
thinking_conversation.append({"role": "user", "content": thinking_prompt}) |
|
|
|
|
|
            for thinking_partial in generate_llm_response(
                thinking_conversation,
                max_new_tokens=int(max_tokens) * 2,
                temperature=temp,
                top_p=top_p,
                top_k=top_k,
                repetition_penalty=rep_penalty
            ):
|
|
|
                history[-1] = {
                    "role": "assistant",
                    "content": thinking_partial,
                    "metadata": {
                        "title": "Réflexion",
                        "status": "pending"
                    }
                }
|
yield history |
|
|
|
history[-1]["metadata"]["status"] = "done" |
|
yield history |
|
|
|
print("DEBUG:\t\tYielded history of ```thinking_result```") |
|
|
|
final_conversation = [] |
|
if system_prompt: |
|
final_conversation.append({"role": "system", "content": system_prompt}) |
|
if knowledge: |
|
final_conversation.append({ |
|
"role": "assistant", |
|
"content": f"J'ai analysé ce document: {knowledge}" |
|
}) |
|
|
|
for msg in history[:-1]: |
|
if "metadata" not in msg or "title" not in msg.get("metadata", {}): |
|
final_conversation.append(msg) |
|
|
|
final_conversation.append({ |
|
"role": "assistant", |
|
"content": f"Voici mon analyse étape par étape:\n{history[-1]['content']}\n\nMaintenant je vais formaliser le résultat final." |
|
}) |
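
            # Switch to a new assistant message for the final answer so the reasoning bubble
            # above is preserved instead of being overwritten by the stream below.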
|
            history.append({
                "role": "assistant",
                "content": "Je formule ma réponse finale..."
            })
            yield history
|
|
|
            for final_partial in generate_llm_response(
                final_conversation,
                max_new_tokens=int(max_tokens),
                temperature=temp * 0.8,
                top_p=top_p,
                top_k=top_k,
                repetition_penalty=rep_penalty
            ):
|
history[-1]["content"] = final_partial |
|
yield history |
|
print("DEBUG:\t\tYielded history of ```final_answer```") |
|
|
|
except Exception as e: |
|
error_traceback = traceback.format_exc() |
|
print(f"Error traceback:\n{error_traceback}") |
|
|
|
history.append({ |
|
"role": "assistant", |
|
"content": f"An error occurred: {str(e)}\n\nTraceback details:\n{error_traceback}" |
|
}) |
|
yield history |
|
|
|
    file_explorer.change(
        fn=append_text_knowledge,
        inputs=file_explorer,
        outputs=knowledge_input
    )
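
    # Event wiring: submitting the message box appends the user turn (user_message) and then
    # chains into bot_response, which streams the thinking bubble and the final answer into the chatbot.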
|
|
|
message_input.submit( |
|
user_message, |
|
inputs=[message_input, chatbot], |
|
outputs=[chatbot, message_input] |
|
).then( |
|
bot_response, |
|
inputs=[ |
|
chatbot, |
|
knowledge_input, |
|
system_prompt_input, |
|
max_tokens_slider, |
|
temperature_slider, |
|
top_p_slider, |
|
top_k_slider, |
|
repetition_penalty_slider |
|
], |
|
outputs=chatbot |
|
) |
|
if __name__ == "__main__": |
|
auth = HuggingFaceLogin() |
|
if auth.login(): |
|
print("Login successful!") |
|
demo.queue().launch() |
|
|