File size: 3,316 Bytes
6f3f07e
3f2900f
8bd462e
3f2900f
8bd462e
2d26215
3f2900f
cf082dc
b87f04a
ded7267
eb1daf1
b87f04a
b6f59a7
6615fb0
b87f04a
6d09328
b87f04a
 
a184d8d
b87f04a
 
 
6615fb0
 
 
56d0515
6615fb0
 
b87f04a
6615fb0
 
 
b6f59a7
6615fb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bd462e
6615fb0
 
 
b87f04a
6615fb0
b87f04a
6615fb0
b87f04a
6615fb0
b87f04a
6d09328
bfe628d
b87f04a
a184d8d
634313c
b011b3b
3f2900f
80f2b5c
acc3ae4
3f2900f
6615fb0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import spaces
import gradio as gr
from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
import os

@spaces.GPU()
def load_model(model_name):
    """Build a text-generation pipeline for ``model_name``.

    The pipeline is placed on CUDA with bfloat16 weights, allows remote
    code from the model repository, and authenticates with the value of
    the ``token`` environment variable.

    Args:
        model_name: Model identifier forwarded to ``pipeline`` as ``model``.

    Returns:
        The object returned by ``transformers.pipeline``.

    Raises:
        KeyError: If the ``token`` environment variable is not set.
    """
    pipeline_options = {
        "model": model_name,
        "device_map": "cuda",
        "torch_dtype": torch.bfloat16,
        "trust_remote_code": True,
        "token": os.environ["token"],
        "use_fast": True,
    }
    return pipeline("text-generation", **pipeline_options)
@spaces.GPU(duration=45)
def generate(
    message,
    history,
    model_name,
    system,
    temperature=0.4,
    top_p=0.95,
    min_p=0.1,
    top_k=50,
    max_new_tokens=256,
):
    """Stream a reply to ``message`` from ``model_name`` using a ChatML prompt.

    The prompt is assembled from the system text, every previous
    ``(user, assistant)`` pair in ``history`` and the new user message,
    each wrapped in ``<|im_start|>`` / ``<|im_end|>`` markers. Generation
    runs in a background thread and the text produced so far is yielded
    each time the streamer delivers a new chunk.

    Args:
        message: The new user message.
        history: Iterable of ``(user_turn, assistant_turn)`` pairs.
        model_name: Model identifier passed to ``load_model`` and
            ``AutoTokenizer.from_pretrained``.
        system: System prompt placed at the start of the conversation.
        temperature: Sampling temperature; a value of 0 or below selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold (sampling only).
        min_p: Min-p threshold (sampling only).
        top_k: Top-k cutoff (sampling only).
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The accumulated response text after each streamed chunk. If the
        stream stalls, the partial text is yielded once more; on any other
        failure ``"An error occurred during generation."`` is yielded.
    """
    # Local import keeps this block self-contained; TextIteratorStreamer
    # raises queue.Empty when its timeout expires without new text.
    from queue import Empty

    # Bound before the try block so every handler below can read it safely.
    outputs = []
    try:
        pipe = load_model(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
        # Make generation stop at the ChatML end-of-turn marker.
        tokenizer.eos_token = "<|im_end|>"
        print(tokenizer)
        pipe.tokenizer = tokenizer

        # Collect the turns and join once instead of repeated string +=.
        turns = [f"<|im_start|>system\n{system}<|im_end|>\n"]
        turns.extend(
            f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
            for (user_turn, assistant_turn) in history
        )
        turns.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
        prompt = "".join(turns)

        streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            text_inputs=prompt,
            streamer=streamer,
            max_new_tokens=int(max_new_tokens),
            num_beams=1,
            repetition_penalty=1.1,
        )
        if temperature > 0:
            # UI sliders may hand back ints; coerce to the types the
            # sampling parameters are documented to take.
            generation_kwargs.update(
                do_sample=True,
                top_p=float(top_p),
                min_p=float(min_p),
                top_k=int(top_k),
                temperature=float(temperature),
            )
        else:
            # Sampling requires a strictly positive temperature, so treat
            # 0 as a request for deterministic (greedy) decoding.
            generation_kwargs["do_sample"] = False

        worker_errors = []

        def _run_pipeline():
            # Runs in the background thread. On failure, record the error
            # and close the stream so the consumer loop below finishes
            # immediately instead of blocking until the streamer timeout.
            try:
                pipe(**generation_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                streamer.end()

        t = Thread(target=_run_pipeline)
        t.start()

        for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)

        t.join()
        if worker_errors:
            # Re-raise in this thread so the generic handler reports it.
            raise worker_errors[0]
    except Empty:
        # No new text arrived within the streamer timeout: keep what we have.
        print("Stream stopped unexpectedly.")
        yield "".join(outputs) or "An error occurred during generation."
    except Exception as e:
        print(f"An error occurred: {e}")
        yield "An error occurred during generation."

# Model repositories offered in the "Model" dropdown; the first is the default.
model_choices = [
    "Locutusque/Apollo-2.0-Llama-3.1-8B",
    "Locutusque/Llama-3-NeuralHermes-Pro-8B",
    "Locutusque/Hercules-5.0-Qwen2-7B",
    "Locutusque/Llama-3-NeuralHercules-5.0-8B",
    "Locutusque/Hercules-5.0-Index-1.9B",
    "Locutusque/Llama-3-Hercules-5.0-8B",
]

# What are the best options?
# Extra controls, listed in the same order as the parameters of `generate`
# that follow (message, history).
_extra_inputs = [
    gr.components.Dropdown(
        choices=model_choices,
        label="Model",
        value=model_choices[0],
        interactive=True,
    ),
    gr.components.Textbox(lines=2, label="System Prompt", value="You are an AI."),
    gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
    gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
    gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
    gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
    gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
]

# Chat UI that streams responses from `generate`.
g = gr.ChatInterface(
    fn=generate,
    additional_inputs=_extra_inputs,
    title="Locutusque's Language Models",
    description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    g.launch()