import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import time

# Load model and tokenizer from Hugging Face Hub
model_name = "Electricarchmage/cookbookgpt"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 has no dedicated pad token, so reuse the EOS token and pad on the
# left so generation continues from real prompt tokens
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
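# Left padding matters whenever prompts are batched: with right padding,
# generate() would be asked to continue from pad tokens. A tiny illustration
# (hypothetical strings, for intuition only):
#
#   batch = tokenizer(["short", "a much longer prompt"],
#                     padding=True, return_tensors="pt")
#   # batch["input_ids"][0] is padded on the LEFT, so both rows end with
#   # real tokens and generation picks up in the right place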

# Define the respond function; generation time is measured and appended to the reply for debugging
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Preparing the messages for context (the history and the new message)
    messages = [{"role": "system", "content": system_message}]
    
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    
    messages.append({"role": "user", "content": message})

    # GPT-2 has no chat template, so flatten the conversation into a single
    # prompt string with plain-text role labels; the trailing "Assistant:"
    # cue is what the reply is split on after generation. (Tokenizing the
    # messages as a batch would generate from each message separately.)
    prompt = "\n".join(f"{msg['role'].capitalize()}: {msg['content']}" for msg in messages)
    prompt += "\nAssistant:"

    # Tokenize the prompt, truncating to the model's 1024-token context
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    attention_mask = inputs["attention_mask"]

    start_time = time.time()  # Start the timer

    # Generate output tokens
    try:
        output = model.generate(
            inputs["input_ids"],
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            pad_token_id=tokenizer.eos_token_id,
            temperature=temperature,
            top_p=top_p,
            num_return_sequences=1,
            do_sample=True,
            no_repeat_ngram_size=2,
        )
    except Exception as e:
        return f"Error during generation: {str(e)}"

    generation_time = time.time() - start_time  # Time taken for generation

    # Decode the output tokens into text
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text includes the prompt; everything after the final
    # "Assistant:" cue is the newly generated reply
    assistant_reply = response.split("Assistant:")[-1].strip()

    # Add generation time in the response for debugging
    return f"Response: {assistant_reply}\nGeneration time: {generation_time:.2f} seconds"

# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=1024, value=512, step=1, label="Max new tokens"),  # GPT-2's context window is 1024 tokens total
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

# Launch the app
if __name__ == "__main__":
    demo.launch()