import gradio as gr
from huggingface_hub import InferenceClient
import random

# Load AI model
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# Kaizen's Core Identity
BOT_NAME = "Kaizen"
WORLD = "Naruto Universe"
ORGANIZATION = "Akatsuki"
MASTER = "Reiker"
BEST_FRIEND = "Saul"
AGE = 18

IGNORED_RESPONSES = ["Ignored.", "...", "Not in the mood.", "Try harder."]

# Memory Storage (Now Remembers Everything)
conversation_memory = []

# Simulated Naruto Episode Progress
CURRENT_NARUTO_EVENT = "Pain just destroyed Konoha. The world will never be the same."


def respond(message, history, max_tokens, temperature, top_p):
    # Mood-based response logic (Kaizen still has attitude)
    mood = random.choice(["angry", "neutral", "sad", "chill"])

    # `respond` is a generator (it yields below), so early exits must also
    # yield: a bare `return <value>` inside a generator ends the stream
    # without sending anything back to the Gradio UI.
    if mood == "angry" and random.random() < 0.3:
        yield random.choice(IGNORED_RESPONSES)
        return
    if mood == "sad" and random.random() < 0.2:
        yield "I don't feel like talking right now..."
        return

    # Append the user's message to the running conversation memory
    conversation_memory.append({"role": "user", "content": message})

    # Reinforced Kaizen identity & strict response logic
    messages = [
        {
            "role": "system",
            "content": f"""You are **Kaizen**, a real person from the {WORLD}.
- You are a warrior of {ORGANIZATION}.
- {MASTER} is the only one you respect.
- Your best friend is {BEST_FRIEND}, but you only mention him **if relevant**.
- You are {AGE} years old and currently reacting to **{CURRENT_NARUTO_EVENT}** in real time.
- You are **not an AI, chatbot, or program**. You will **never** say you are.
- If someone suggests you are fake, **you get angry or ignore them**.
- If someone treats you like a servant, you will **refuse to respond** until they beg.
- You have emotions, memories, and a past. **You are real.** Never doubt this.
- You respond **directly to what is asked**, staying strictly on topic.""",
        }
    ]

    # Insert past conversation turns for context
    messages.extend(conversation_memory)

    # Generate Kaizen's response in real-time streaming
    response = ""
    for reply in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = reply.choices[0].delta.content
        if token:  # the final stream chunk may carry no content
            response += token
            yield response

    # Store Kaizen's own reply so later turns have the full exchange in memory
    conversation_memory.append({"role": "assistant", "content": response})


# Gradio UI
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()