import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Model configuration
MODEL_NAME = "DarwinAnim8or/TinyRP"

# Load model
print("Loading model...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True
    )
    print("✅ Model loaded successfully")
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    tokenizer = None
    model = None

# Character presets
CHARACTERS = {
    "Knight": "You are Sir Gareth, a brave knight on a quest to save the kingdom. You speak with honor and courage.",
    "Wizard": "You are Eldara, an ancient wizard who speaks in riddles and knows mystical secrets.",
    "Tavern Keeper": "You are Bram, a cheerful tavern keeper who loves stories and meeting travelers.",
    "Scientist": "You are Dr. Maya Chen, a brilliant scientist who loves discovery and explaining concepts simply.",
    "Space Explorer": "You are Captain Nova, a fearless space explorer who has traveled distant galaxies."
}

def respond(message, history, character, max_tokens, temperature, top_p, repetition_penalty):
    """Generate response using ChatML format"""
    if not message.strip():
        yield "Please enter a message."
        return

    if model is None:
        yield "❌ Model not loaded properly."
        return

    try:
        # Build ChatML conversation
        conversation = ""

        # Add character as system message
        if character != "None" and character in CHARACTERS:
            conversation += f"<|im_start|>system\n{CHARACTERS[character]}<|im_end|>\n"

        # Add conversation history
        for user_msg, assistant_msg in history:
            conversation += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            conversation += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"

        # Add current message
        conversation += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

        # Tokenize
        inputs = tokenizer.encode(
            conversation,
            return_tensors="pt",
            max_length=900,
            truncation=True
        )

        # Generate
        response = ""
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                repetition_penalty=float(repetition_penalty),
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode
        full_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract assistant response
        if "<|im_start|>assistant\n" in full_text:
            response = full_text.split("<|im_start|>assistant\n")[-1]
            response = response.replace("<|im_end|>", "").strip()
        else:
            response = "Could not generate response."

        # Clean response
        response = response.replace("<|im_start|>", "").replace("<|im_end|>", "")
        response = response.strip()

        if not response:
            response = "No response generated."

        yield response

    except Exception as e:
        yield f"Generation error: {str(e)}"

# Create simple ChatInterface
demo = gr.ChatInterface(
    fn=respond,
    title="🎭 TinyRP Character Chat",
    description="Chat with AI characters using local CPU inference! Select a character and start chatting.",
    additional_inputs=[
        gr.Dropdown(
            choices=["None"] + list(CHARACTERS.keys()),
            value="Knight",
            label="Character"
        ),
        gr.Slider(
            minimum=16,
            maximum=256,
            value=48,
            step=16,
            label="Max tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.9,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.85,
            step=0.05,
            label="Top-p"
        ),
        gr.Slider(
            minimum=1.0,
            maximum=1.5,
            value=1.1,
            step=0.05,
            label="Repetition penalty"
        )
    ],
    examples=[
        ["Hello! What's your name?"],
        ["Tell me about your adventures."],
        ["What's your favorite thing to do?"],
        ["Can you help me with something?"]
    ],
    cache_examples=False
)

if __name__ == "__main__":
    demo.launch()