import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


def load_model():
    # Download the GGUF weights from the Hugging Face Hub (cached after the first run)
    repo_id = "forestav/gguf_lora_model"
    model_file = "unsloth.F16.gguf"
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=model_file
    )
    print(f"Loading model from: {local_path}")
    model = Llama(
        model_path=local_path,
        n_ctx=2048,   # context window size in tokens
        n_threads=8   # CPU threads used for inference
    )
    return model


def generate_response(message, history):
    # Note: only the latest message is sent; earlier turns in `history` are not included
    response = model.create_chat_completion(
        messages=[
            {"role": "user", "content": message}
        ],
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return response['choices'][0]['message']['content']


# Load model globally so it is initialized once, before the interface starts
print("Starting model loading...")
model = load_model()
print("Model loaded successfully!")

# Create Gradio interface
demo = gr.ChatInterface(
    fn=generate_response,
    title="Your GGUF Model Chat",
    description="A conversational AI model using GGUF format",
    examples=["Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,"]
)

# Add proper Gradio launch configuration for Spaces
demo.launch(
    server_name="0.0.0.0",  # Necessary for Spaces
    server_port=7860,       # Standard port for Spaces
    share=False             # Don't need a share link in Spaces
)
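
# --- Optional: multi-turn context (illustrative sketch, not part of the original app) ---
# generate_response above sends only the latest message, so the model has no memory of
# earlier turns. The variant below is a minimal sketch of threading Gradio's chat history
# into create_chat_completion. The function name is hypothetical, and it assumes `history`
# arrives as (user, assistant) tuples, Gradio ChatInterface's default history format.
# To use it, define it above and pass fn=generate_response_with_history to gr.ChatInterface.
def generate_response_with_history(message, history):
    messages = []
    for user_msg, assistant_msg in history:  # assumes tuple-style history
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    response = model.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return response['choices'][0]['message']['content']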