import os

import gradio as gr
from llama_cpp import Llama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate

# Initialize the embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Load the existing Chroma vector store
persist_directory = os.path.join(os.path.dirname(__file__), "mydb")
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Initialize the Llama model
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
)

# Create the RAG prompt template
template = """Answer the question based only on the following context:

{context}

Question: {question}

Answer the question in a clear and concise way. If you cannot find the answer in the context, just say "I don't have enough information to answer this question."

Make sure to:
1. Only use information from the provided context
2. Be concise and direct
3. If you're unsure, acknowledge it
"""

prompt = PromptTemplate.from_template(template)


def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    # top_p,
):
    # Build the messages list
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Search the vector store
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    docs = retriever.get_relevant_documents(message)
    context = "\n\n".join([doc.page_content for doc in docs])

    # Format the prompt
    final_prompt = prompt.format(context=context, question=message)

    # Add the formatted prompt to messages
    messages.append({"role": "user", "content": final_prompt})

    # Generate response using the Llama model
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        # top_p=top_p,
    )

    # Extract the assistant's reply
    assistant_reply = response["choices"][0]["message"]["content"]
    return assistant_reply


# Create Gradio Chat Interface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly chatbot.", label="System Message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        # gr.Slider(
        #     minimum=0.1,
        #     maximum=1.0,
        #     value=0.95,
        #     step=0.05,
        #     label="Top-p (Nucleus Sampling)",
        # ),
    ],
    title="Document-Based QA with Llama",
    description="A PDF Chat interface powered by the Llama model.",
    examples=["What is a Computer?"],
    theme="default",
)

if __name__ == "__main__":
    demo.launch()