import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Model configuration
MODEL_NAME = "DarwinAnim8or/TinyRP"

# Load model
print("Loading model...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True
    )
    print("✅ Model loaded successfully")
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    tokenizer = None
    model = None

# Character presets
CHARACTERS = {
    "Knight": "You are Sir Gareth, a brave knight on a quest to save the kingdom. You speak with honor and courage.",
    "Wizard": "You are Eldara, an ancient wizard who speaks in riddles and knows mystical secrets.",
    "Tavern Keeper": "You are Bram, a cheerful tavern keeper who loves stories and meeting travelers.",
    "Scientist": "You are Dr. Maya Chen, a brilliant scientist who loves discovery and explaining concepts simply.",
    "Space Explorer": "You are Captain Nova, a fearless space explorer who has traveled distant galaxies."
}

def respond(message, history, character, max_tokens, temperature, top_p, repetition_penalty):
    """Generate response using ChatML format"""
    if not message.strip():
        yield "Please enter a message."
        return

    if model is None:
        yield "❌ Model not loaded properly."
        return

    try:
        # Build ChatML conversation
        conversation = ""

        # Add character as system message
        if character != "None" and character in CHARACTERS:
            conversation += f"<|im_start|>system\n{CHARACTERS[character]}<|im_end|>\n"

        # Add conversation history
        for user_msg, assistant_msg in history:
            conversation += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            conversation += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"

        # Add current message
        conversation += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

        # Tokenize
        inputs = tokenizer.encode(
            conversation,
            return_tensors="pt",
            max_length=900,
            truncation=True
        )

        # Generate
        response = ""
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                repetition_penalty=float(repetition_penalty),
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode
        full_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract assistant response
        if "<|im_start|>assistant\n" in full_text:
            response = full_text.split("<|im_start|>assistant\n")[-1]
            response = response.replace("<|im_end|>", "").strip()
        else:
            response = "Could not generate response."

        # Clean response
        response = response.replace("<|im_start|>", "").replace("<|im_end|>", "")
        response = response.strip()

        if not response:
            response = "No response generated."

        yield response

    except Exception as e:
        yield f"Generation error: {str(e)}"

# Create simple ChatInterface
demo = gr.ChatInterface(
    fn=respond,
    title="🎭 TinyRP Character Chat",
    description="Chat with AI characters using local CPU inference! Select a character and start chatting.",
    additional_inputs=[
        gr.Dropdown(
            choices=["None"] + list(CHARACTERS.keys()),
            value="Knight",
            label="Character"
        ),
        gr.Slider(
            minimum=16,
            maximum=256,
            value=48,
            step=16,
            label="Max tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.9,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.85,
            step=0.05,
            label="Top-p"
        ),
        gr.Slider(
            minimum=1.0,
            maximum=1.5,
            value=1.1,
            step=0.05,
            label="Repetition penalty"
        )
    ],
    examples=[
        ["Hello! What's your name?"],
        ["Tell me about your adventures."],
        ["What's your favorite thing to do?"],
        ["Can you help me with something?"]
    ],
    cache_examples=False
)

if __name__ == "__main__":
    demo.launch()