import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# Checkpoint served by this app.
_MODEL_ID = "LiquidAI/LFM2-1.2B"

# Maps model_id -> (tokenizer, model). Filled lazily on first use so that
# later chat turns handled by the same process reuse the loaded weights
# instead of calling from_pretrained() again.
_LOADED_MODELS = {}


def _load_model_and_tokenizer(model_id):
    """Return ``(tokenizer, model)`` for *model_id*, loading them on first use."""
    cached = _LOADED_MODELS.get(model_id)
    if cached is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            load_in_4bit=True,  # Keeping 4-bit quantization for efficiency
            # Optionally add attn_implementation="flash_attention_2" on a
            # compatible GPU.
        )
        cached = _LOADED_MODELS[model_id] = (tokenizer, model)
    return cached


def _history_to_messages(history):
    """Convert chat history into a list of ``{"role", "content"}`` messages.

    Two history layouts are accepted:

    * a sequence of ``(user, assistant)`` pairs, where the position inside
      the pair decides the role (even index -> user, odd -> assistant);
    * a sequence of dicts that already carry ``"role"`` and ``"content"``.

    Entries whose content is empty or ``None`` are skipped in both layouts.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Already role/content shaped: copy only the two keys the chat
            # template needs and ignore any extra keys on the entry.
            role = turn.get("role")
            content = turn.get("content")
            if role and content:
                messages.append({"role": role, "content": content})
            continue
        for position, text in enumerate(turn):
            if text:
                role = "user" if position % 2 == 0 else "assistant"
                messages.append({"role": role, "content": text})
    return messages


@spaces.GPU
def predict(message, history):
    """Stream the model's reply to *message*, given the earlier *history*.

    Args:
        message: The latest user message.
        history: Earlier conversation turns, either as ``(user, assistant)``
            pairs or as ``{"role": ..., "content": ...}`` dicts.

    Yields:
        The reply text accumulated so far; each yield extends the previous
        one by the newly decoded chunk.
    """
    torch.set_default_device("cuda")

    # Load model and tokenizer (cached after the first call in this process)
    tokenizer, model = _load_model_and_tokenizer(_MODEL_ID)

    # Format conversation history for chat template
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    # Apply chat template
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True
    ).to('cuda')

    # Setup streamer for real-time output; skip_prompt/skip_special_tokens
    # keep the echoed prompt and special tokens out of the streamed text.
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        min_p=0.15,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.eos_token_id
    )

    # Run generation in a separate thread so this generator can consume the
    # streamer while tokens are still being produced.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    # Stream tokens
    partial_message = ""
    for new_text in streamer:
        partial_message += new_text
        yield partial_message

    # The stream is exhausted, so generation is finishing; reap the thread
    # rather than leaving it dangling.
    worker.join()

# Build the Gradio chat UI around `predict` and start serving it.

# Markdown/HTML blurb passed as the interface description. Written as explicit
# "\n"-terminated pieces so the whitespace-only lines are visible.
_description = (
    "\n"
    "    <center><h2>LiquidAI LFM2-1.2B Chat</h2></center>\n"
    "    \n"
    "    Chat with [LiquidAI/LFM2-1.2B](https://huggingface.co/LiquidAI/LFM2-1.2B), a compact and efficient language model.\n"
    "    \n"
    "    This model provides high-quality responses while maintaining a small footprint, making it ideal for fast inference.\n"
    "    "
)

# Prompts passed as the interface's `examples`.
_example_prompts = [
    "Can you solve the equation 2x + 3 = 11 for x?",
    "What is C. elegans?",
    "Explain quantum computing in simple terms",
    "Write a Python function to find prime numbers",
    "What are the key differences between RNA and DNA?",
    "Can you write a haiku about artificial intelligence?",
]

gr.ChatInterface(
    predict,
    description=_description,
    examples=_example_prompts,
    theme=gr.themes.Soft(primary_hue="blue"),
).launch()