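"""Gradio chat demo for the DarwinAnim8or/TinyRP model.

Builds ChatML-formatted prompts from the selected character preset and the
chat history, runs CPU-only inference with transformers, and exposes the
sampling controls through gr.ChatInterface.
"""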
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Model configuration
MODEL_NAME = "DarwinAnim8or/TinyRP"

# Load model
print("Loading model...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True
    )
    print("βœ… Model loaded successfully")
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    tokenizer = None
    model = None

# Character presets
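# Each preset becomes the ChatML system message for the selected character.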
CHARACTERS = {
    "Knight": "You are Sir Gareth, a brave knight on a quest to save the kingdom. You speak with honor and courage.",
    "Wizard": "You are Eldara, an ancient wizard who speaks in riddles and knows mystical secrets.",
    "Tavern Keeper": "You are Bram, a cheerful tavern keeper who loves stories and meeting travelers.",
    "Scientist": "You are Dr. Maya Chen, a brilliant scientist who loves discovery and explaining concepts simply.",
    "Space Explorer": "You are Captain Nova, a fearless space explorer who has traveled distant galaxies."
}

def respond(message, history, character, max_tokens, temperature, top_p, repetition_penalty):
    """Generate response using ChatML format"""
    
    if not message.strip():
        yield "Please enter a message."
        return
        
    if model is None or tokenizer is None:
        yield "❌ Model not loaded properly."
        return
    
    try:
        # Build ChatML conversation
        conversation = ""
        
        # Add character as system message
        if character != "None" and character in CHARACTERS:
            conversation += f"<|im_start|>system\n{CHARACTERS[character]}<|im_end|>\n"
        
        # Add conversation history
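        # gr.ChatInterface is assumed to pass tuple-style history: [(user, assistant), ...]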
        for user_msg, assistant_msg in history:
            conversation += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            conversation += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
        
        # Add current message
        conversation += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
        
        # Tokenize
        inputs = tokenizer.encode(
            conversation, 
            return_tensors="pt", 
            max_length=900, 
            truncation=True
        )
        
        # Generate
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                repetition_penalty=float(repetition_penalty),
                do_sample=True,
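                # reuse EOS as padding; GPT-style tokenizers typically define no pad token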
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        
        # Decode with special tokens kept so the ChatML markers can be used to split out the reply
        full_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
        
        # Extract assistant response
        if "<|im_start|>assistant\n" in full_text:
            response = full_text.split("<|im_start|>assistant\n")[-1]
            response = response.replace("<|im_end|>", "").strip()
        else:
            response = "Could not generate response."
        
        # Clean response
        response = response.replace("<|im_start|>", "").replace("<|im_end|>", "")
        response = response.strip()
        
        if not response:
            response = "No response generated."
            
        yield response
        
    except Exception as e:
        yield f"Generation error: {str(e)}"

# Create simple ChatInterface
demo = gr.ChatInterface(
    fn=respond,
    title="🎭 TinyRP Character Chat",
    description="Chat with AI characters using local CPU inference! Select a character and start chatting.",
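    # The controls below are forwarded to respond() after (message, history), in this order.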
    additional_inputs=[
        gr.Dropdown(
            choices=["None"] + list(CHARACTERS.keys()), 
            value="Knight",
            label="Character"
        ),
        gr.Slider(
            minimum=16, 
            maximum=256, 
            value=48, 
            step=16,
            label="Max tokens"
        ),
        gr.Slider(
            minimum=0.1, 
            maximum=2.0, 
            value=0.9, 
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1, 
            maximum=1.0, 
            value=0.85, 
            step=0.05,
            label="Top-p"
        ),
        gr.Slider(
            minimum=1.0, 
            maximum=1.5, 
            value=1.1, 
            step=0.05,
            label="Repetition penalty"
        )
    ],
    examples=[
        ["Hello! What's your name?"],
        ["Tell me about your adventures."],
        ["What's your favorite thing to do?"],
        ["Can you help me with something?"]
    ],
    cache_examples=False
)

if __name__ == "__main__":
    demo.launch()