import llama_cpp
from llama_cpp import Llama
# import llama_cpp.llama_tokenizer
import gradio as gr

from huggingface_hub import hf_hub_download

# Hugging Face Hub repository and the GGUF weights file to pull from it.
model_name = "large-traversaal/Alif-1.0-8B-Instruct"
model_file = "model-Q8_0.gguf"

# Resolve the weights to a path on local disk; this is what Llama() loads.
model_path_file = hf_hub_download(repo_id=model_name, filename=model_file)


# Load the model once at import time; chat_with_ai() reuses this instance
# for every request.
llama = Llama(
    model_path=model_path_file,
    # Context window size, in tokens.
    n_ctx=4096,
    # Batch size used when processing the prompt.
    n_batch=512,
    # Number of model layers to offload to the GPU; lower it if VRAM is short.
    n_gpu_layers=40,
    # CPU threads used for generation; tune to the host's core count.
    n_threads=8,
    # Emit llama.cpp's diagnostic output while loading and running.
    verbose=True,
)

# Prompt template; the `{inp}` placeholder is filled with the user's text via
# str.format() in chat_with_ai().
# NOTE(review): "approriate" is misspelled in the prompt. It is left as-is
# because changing the prompt text may change the model's output -- confirm
# before correcting.
chat_prompt = (
    "You are Urdu Chatbot. "
    "Write approriate response for given instruction:"
    "{inp} Response:"
)

# Function to generate text with streaming output
def chat_with_ai(prompt):
    """Stream the model's reply to ``prompt``.

    The user text is inserted into the module-level ``chat_prompt`` template
    and sent to the module-level ``llama`` model with streaming enabled.

    Args:
        prompt: The user's instruction text.

    Yields:
        The reply accumulated so far, re-emitted each time a non-empty piece
        of text arrives, so the caller can overwrite its display with the
        growing response.
    """
    stream = llama(
        chat_prompt.format(inp=prompt),
        max_tokens=256,
        # Generation halts at the first "Q:" or newline, so a reply is at
        # most a single line.
        stop=["Q:", "\n"],
        echo=False,
        stream=True,
    )

    reply = ""
    for event in stream:
        fragment = event["choices"][0]["text"]
        if not fragment:
            # Skip chunks that carry no text; nothing new to show.
            continue
        reply = reply + fragment
        yield reply


# Gradio front end: one text input wired to the streaming generator
# chat_with_ai, with one text output that shows the growing reply.
demo = gr.Interface(
    chat_with_ai,
    "text",
    "text",
    title="Streaming Alif-1.0-8B-Instruct Chatbot 🚀",
    description="Enter a prompt and get a streamed response.",
)

# Start the web server; share=True also requests a public share link.
demo.launch(share=True)