import gradio as gr
from llama_cpp import Llama

# Load the quantized fine-tuned Llama 3.1 model from the Hugging Face Hub.
# Downloaded once and cached locally by llama-cpp-python on first run.
llm = Llama.from_pretrained(
    repo_id="GSridhar1982/QA_Llama31_Quantized_GGUF",
    filename="QA_llama31_unsloth.Q4_K_M.gguf",
)


def generate_response(user_input):
    """Run one chat-completion turn against the loaded model.

    Args:
        user_input: The question text entered by the user.

    Returns:
        The model's reply as a plain string.
    """
    # Perform inference with a single-turn user message.
    response = llm.create_chat_completion(
        messages=[
            {
                "role": "user",
                "content": user_input,
            }
        ]
    )
    # Extract the model's reply from the OpenAI-style response dict.
    model_reply = response['choices'][0]['message']['content']
    return model_reply


# Create a Gradio interface.
# NOTE: the original used gr.inputs.Textbox — the `gr.inputs` namespace was
# deprecated in Gradio 3.x and removed in 4.x; components now live directly
# on the `gr` module.
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your question here..."),
    outputs="text",
    title="AIML Q&A Chatbot",
    description="Ask questions related to AIML and get answers from the fine-tuned Llama model."
)

# Launch the app (blocks and serves the web UI).
iface.launch()