import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


def load_model():
    # Download the GGUF weights from the Hugging Face Hub (cached after the first run)
    repo_id = "forestav/gguf_lora_model"
    model_file = "unsloth.F16.gguf"
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=model_file
    )
    print(f"Loading model from: {local_path}")
    model = Llama(
        model_path=local_path,
        n_ctx=2048,   # context window size in tokens
        n_threads=8   # CPU threads used for inference
    )
    return model


def generate_response(message, history):
    # Note: only the latest message is sent; earlier turns in `history` are not included
    response = model.create_chat_completion(
        messages=[
            {"role": "user", "content": message}
        ],
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return response['choices'][0]['message']['content']


# Load model globally so it is initialized once, before the interface starts
print("Starting model loading...")
model = load_model()
print("Model loaded successfully!")

# Create Gradio interface
demo = gr.ChatInterface(
    fn=generate_response,
    title="Your GGUF Model Chat",
    description="A conversational AI model using GGUF format",
    examples=["Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,"]
)

# Add proper Gradio launch configuration for Spaces
demo.launch(
    server_name="0.0.0.0",  # Necessary for Spaces
    server_port=7860,       # Standard port for Spaces
    share=False             # Don't need a share link in Spaces
)
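
# --- Optional: multi-turn context (illustrative sketch, not part of the original app) ---
# generate_response above sends only the latest message, so the model has no memory of
# earlier turns. The variant below is a minimal sketch of threading Gradio's chat history
# into create_chat_completion. The function name is hypothetical, and it assumes `history`
# arrives as (user, assistant) tuples, Gradio ChatInterface's default history format.
# To use it, define it above and pass fn=generate_response_with_history to gr.ChatInterface.
def generate_response_with_history(message, history):
    messages = []
    for user_msg, assistant_msg in history:  # assumes tuple-style history
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    response = model.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return response['choices'][0]['message']['content']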