import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import gradio as gr

# Load the custom model and tokenizer
model_path = 'redael/model_udc'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# GPT-2 tokenizers often ship without a pad token; reuse the EOS token so the
# padded tokenizer call in generate_response() does not raise an error
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Check if CUDA is available and use GPU if possible, enable FP16 precision
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
if device.type == 'cuda':
    model = model.half()  # Use FP16 precision

def generate_response(prompt, model, tokenizer, max_new_tokens=100, num_beams=1, temperature=0.7, top_p=0.9, repetition_penalty=2.0):
    # The prompt is expected to already contain the "User: ...\nAssistant:" turns
    # (respond() below builds it from the chat history), so it is tokenized as-is
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,  # cap the length of the reply without counting the prompt
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        num_beams=num_beams,
        do_sample=True,  # sampling must be enabled for temperature and top_p to take effect
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Post-processing to clean up the response
    response = response.split("Assistant:")[-1].strip()
    response_lines = response.split('\n')
    clean_response = []
    for line in response_lines:
        if "User:" not in line and "Assistant:" not in line:
            clean_response.append(line)
    response = ' '.join(clean_response)
    return response.strip()
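# Illustrative usage (not part of the app flow): generate_response can be called directly
# with any "User: ...\nAssistant:" formatted string for quick testing outside the UI.
# The example prompt below is made up; uncomment to try it from the command line.
# print(generate_response("User: What can you do?\nAssistant:", model, tokenizer, max_new_tokens=50))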

def respond(message, history):
    # Build the prompt from the chat history plus the new message; history arrives as a list
    # of (user_message, assistant_response) pairs in gr.ChatInterface's tuple-style format
    system_message = "You are a friendly chatbot."
    conversation = system_message + "\n"
    for user_message, assistant_response in history:
        conversation += f"User: {user_message}\nAssistant: {assistant_response}\n"
    conversation += f"User: {message}\nAssistant:"

    # Fixed values for generation parameters
    max_tokens = 100  # maximum number of new tokens to generate per reply
    temperature = 0.7
    top_p = 0.9

    response = generate_response(conversation, model, tokenizer, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)
    
    return response

# Gradio Chat Interface
demo = gr.ChatInterface(
    respond
)
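# Note: demo.launch() below serves the app locally; demo.launch(share=True) can be used
# instead to get a temporary public Gradio link (optional, shown here only as a usage hint).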

if __name__ == "__main__":
    demo.launch()