import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Load the model and tokenizer from the Hugging Face Hub.
# trust_remote_code=True is required because the checkpoint ships its own model class.
model_name = "codewithdark/latent-recurrent-depth-lm"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()  # Set to evaluation mode

# Inference function: tokenize the prompt, generate up to 512 tokens, decode the result.
def chat_with_model(input_text, model_choice):
    if model_choice == "Latent Recurrent Depth LM":
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
        with torch.no_grad():
            output = model.generate(input_ids, max_length=512)
        return tokenizer.decode(output[0], skip_special_tokens=True)
    return "Model not available yet!"

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Chat with Latent Recurrent Depth LM")
    model_choice = gr.Radio(
        ["Latent Recurrent Depth LM"],  # Add more models here if needed
        label="Select Model",
        value="Latent Recurrent Depth LM",
    )
    text_input = gr.Textbox(label="Enter your message")
    submit_button = gr.Button("Generate Response")
    output_text = gr.Textbox(label="Model Response")
    submit_button.click(fn=chat_with_model, inputs=[text_input, model_choice], outputs=output_text)

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
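
# Usage note (a sketch, not part of the original script): save this file as
# app.py (the filename is illustrative) and run it after installing the
# dependencies:
#
#   pip install gradio torch transformers
#   python app.py
#
# Gradio serves the UI at http://127.0.0.1:7860 by default; launching with
# demo.launch(share=True) additionally creates a temporary public link.
# The generation step above relies on the checkpoint's remote code providing
# a generate() method, which is assumed here rather than guaranteed by AutoModel.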