import os

import gradio as gr
from llama_cpp import Llama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate

# Initialize the embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Load the existing Chroma vector store
persist_directory = os.path.join(os.path.dirname(__file__), "mydb")
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Initialize the Llama model
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
)

# Create the RAG prompt template
template = """Answer the question based only on the following context:

{context}

Question: {question}

Answer the question in a clear and concise way. If you cannot find the answer in the context, just say "I don't have enough information to answer this question."

Make sure to:
1. Only use information from the provided context
2. Be concise and direct
3. If you're unsure, acknowledge it
"""

prompt = PromptTemplate.from_template(template)


def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    # top_p,
):
    # Build the messages list
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Search the vector store
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    docs = retriever.get_relevant_documents(message)
    context = "\n\n".join([doc.page_content for doc in docs])

    # Format the prompt
    final_prompt = prompt.format(context=context, question=message)

    # Add the formatted prompt to messages
    messages.append({"role": "user", "content": final_prompt})

    # Generate response using the Llama model
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        # top_p=top_p,
    )

    # Extract the assistant's reply
    assistant_reply = response["choices"][0]["message"]["content"]
    return assistant_reply


# Create Gradio Chat Interface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly chatbot.", label="System Message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        # gr.Slider(
        #     minimum=0.1,
        #     maximum=1.0,
        #     value=0.95,
        #     step=0.05,
        #     label="Top-p (Nucleus Sampling)",
        # ),
    ],
    title="Document-Based QA with Llama",
    description="A PDF Chat interface powered by the Llama model.",
    examples=["What is a Computer?"],
    theme="default",
)

if __name__ == "__main__":
    demo.launch()