# NOTE(review): import order is kept exactly as in the original file, with
# `spaces` first; presumably it must be loaded before torch on ZeroGPU
# hardware -- confirm before reordering.
import spaces
import gradio as gr
from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
import os
import queue


@spaces.GPU()
def load_model(model_name):
    """Build a text-generation pipeline for ``model_name`` on the GPU.

    Args:
        model_name: Hugging Face Hub repository id of the model to load.

    Returns:
        A ``transformers`` text-generation pipeline placed on ``cuda`` and
        loaded in bfloat16.

    Raises:
        KeyError: If the ``token`` environment variable is not set.
    """
    return pipeline(
        "text-generation",
        model=model_name,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        token=os.environ["token"],
        use_fast=True,
    )


def _build_prompt(system, history, message):
    """Render the conversation as a ChatML prompt ending in an open assistant turn."""
    parts = [f"<|im_start|>system\n{system}<|im_end|>\n"]
    parts.extend(
        f"<|im_start|>user\n{user_turn}<|im_end|>\n"
        f"<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
        for user_turn, assistant_turn in history
    )
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


@spaces.GPU(duration=45)
def generate(
    message,
    history,
    model_name,
    system,
    temperature=0.4,
    top_p=0.95,
    min_p=0.1,
    top_k=50,
    max_new_tokens=256,
):
    """Stream a chat completion for ``message`` from the selected model.

    Args:
        message: The new user message.
        history: Iterable of ``(user_turn, assistant_turn)`` pairs from the
            earlier conversation.
        model_name: Hugging Face Hub repository id of the model to run.
        system: System prompt placed at the start of the ChatML prompt.
        temperature: Sampling temperature. A value <= 0 selects greedy
            decoding, because sampling requires a strictly positive value.
        top_p: Nucleus-sampling threshold (sampling only).
        min_p: Min-p sampling threshold (sampling only).
        top_k: Top-k sampling cutoff (sampling only).
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The full response text accumulated so far, once per streamed chunk.
        On failure a single generic error message is yielded instead; if the
        stream stalls, whatever text was already produced is yielded.
    """
    # Bound before the ``try`` so every handler can read it safely.
    outputs = []
    try:
        pipe = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True, token=os.environ["token"]
        )
        # Stop generation at the ChatML end-of-turn marker.
        tokenizer.eos_token = "<|im_end|>"
        pipe.tokenizer = tokenizer

        streamer = TextIteratorStreamer(
            pipe.tokenizer,
            timeout=240.0,
            skip_prompt=True,
            skip_special_tokens=True,
        )
        generation_kwargs = dict(
            text_inputs=_build_prompt(system, history, message),
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            num_beams=1,
            repetition_penalty=1.1,
        )
        if temperature > 0:
            generation_kwargs.update(
                do_sample=True,
                top_p=top_p,
                min_p=min_p,
                top_k=top_k,
                temperature=temperature,
            )
        else:
            # The UI slider allows 0; fall back to greedy decoding and leave
            # out the sampling-only parameters.
            generation_kwargs.update(do_sample=False)

        # The pipeline call blocks until generation finishes, so it runs in a
        # worker thread while this generator drains the streamer. Exceptions
        # raised inside the worker are not propagated here; they show up as a
        # stalled stream.
        worker = Thread(target=pipe.__call__, kwargs=generation_kwargs)
        worker.start()

        for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)
        worker.join()
    except queue.Empty:
        # Raised by the streamer when no text arrives within its timeout.
        print("Stream stopped unexpectedly.")
        if outputs:
            yield "".join(outputs)
        else:
            yield "An error occurred during generation."
    except Exception as e:
        print(f"An error occurred: {e}")
        yield "An error occurred during generation."


model_choices = [
    "Locutusque/Apollo-2.0-Llama-3.1-8B",
    "Locutusque/Apollo-2.0-Qwen-2.5-7B",
    "Locutusque/Hercules-6.0-Llama-3.1-8B",
    "Locutusque/Llama-3-NeuralHercules-5.0-8B",
    "Locutusque/Hercules-5.0-Index-1.9B",
    "Locutusque/Llama-3-Hercules-5.0-8B",
]
# What are the best options?
# Extra controls shown with the chat box. They are listed in the same order as
# the parameters of `generate` that follow (message, history).
_additional_inputs = [
    gr.components.Dropdown(
        choices=model_choices,
        label="Model",
        value=model_choices[0],
        interactive=True,
    ),
    gr.components.Textbox(
        lines=2,
        label="System Prompt",
        value="You are a sentient AI who is very emotional.",
    ),
    gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
    gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
    gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
    gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
    gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
]

# Chat UI wired to the streaming `generate` function.
g = gr.ChatInterface(
    fn=generate,
    additional_inputs=_additional_inputs,
    title="Locutusque's Language Models",
    description=(
        "Try out Locutusque's language models here! "
        "Credit goes to Mediocreatmybest for this space. "
        "You may also find some experimental preview models that have not been made public here."
    ),
)

if __name__ == "__main__":
    g.launch()