import gradio as gr
from transformers import AutoTokenizer, pipeline
from auto_gptq import AutoGPTQForCausalLM
import torch
import os

# Model loading with memory optimization
MODEL_NAME = "TheBloke/deepseek-coder-1.3b-instruct-GPTQ"
cache_dir = "./model_cache"
os.makedirs(cache_dir, exist_ok=True)

# Load tokenizer and GPTQ-quantized (4-bit) model weights
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, cache_dir=cache_dir)
model = AutoGPTQForCausalLM.from_quantized(
    MODEL_NAME,
    model_basename="model",
    use_safetensors=True,
    quantize_config=None,
    device_map="auto",
    low_cpu_mem_usage=True,
    cache_dir=cache_dir
)

# Create generation pipeline; the model is already placed by device_map="auto",
# so no device argument is passed here
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)


def generate_text(prompt, max_length=512, temperature=0.7):
    """Generate a completion for an instruction-style prompt and return only the model's output."""
    full_prompt = f"Instruct: {prompt}\nOutput:"
    with torch.inference_mode():
        response = generator(
            full_prompt,
            max_new_tokens=max_length,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )[0]["generated_text"]
    # Remove the prompt from the generated text
    return response.split("Output:")[-1].strip()


# Gradio interface with enhanced UX
with gr.Blocks(theme="soft", css=".gradio-container {max-width: 800px; margin: auto;}") as demo:
    gr.Markdown("""
    # 🧠 DeepSeek Coder 1.3B Instruct (GPTQ)
    *Text-to-Code Generation App*

    Enter a programming instruction below and adjust the parameters for optimal output.
    """)

    with gr.Row():
        prompt = gr.Textbox(
            label="Enter your instruction",
            placeholder="Write a Python function to calculate Fibonacci numbers...",
            lines=4
        )

    with gr.Row():
        max_length = gr.Slider(64, 2048, value=512, label="Max Output Length")
        temperature = gr.Slider(0.1, 1.5, value=0.7, label="Creativity (Temperature)")

    output = gr.Textbox(label="Generated Output", lines=10)
    submit = gr.Button("✨ Generate Code", variant="primary")

    submit.click(
        fn=generate_text,
        inputs=[prompt, max_length, temperature],
        outputs=output
    )

    gr.Markdown("""
    ### ℹ️ How it works
    - Uses a GPTQ-quantized model for efficient inference
    - Automatically handles context window management
    - Temperature controls randomness (0.1 = strict, 1.5 = creative)
    """)

demo.launch()
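
# Optional sanity check (a minimal sketch, not part of the app): uncomment to
# exercise generate_text() directly, without the Gradio UI. The example
# instruction below is illustrative only and assumes the model loaded above.
# print(generate_text("Write a Python function to reverse a string.", max_length=128))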