import os

from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory


def load_llm():
    """
    Downloads a Qwen2.5 GGUF model and loads it via llama-cpp.
    """
    # 1) Download the GGUF model from Hugging Face
    model_file = hf_hub_download(
        repo_id="bartowski/Qwen2.5-7B-Instruct-GGUF",  # Non-math version
        filename="Qwen2.5-7B-Instruct-Q4_K_M.gguf",    # Example file
        local_dir="./models",
        local_dir_use_symlinks=False
    )

    # 2) Load the model with llama-cpp via LangChain's LlamaCpp
    llm = LlamaCpp(
        model_path=model_file,
        flash_attn=False,
        n_ctx=2048,   # or 4096
        n_batch=512,  # or even 256
        chat_format='chatml'
    )
    return llm


def build_conversational_chain(vectorstore):
    """
    Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
    and a ConversationBufferMemory for multi-turn Q&A.
    """
    llm = load_llm()

    # We'll store chat history in memory so the chain can handle multi-turn conversations
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5}
        ),
        memory=memory,
        verbose=True
    )
    return qa_chain
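

# --- Example usage: a minimal sketch, not part of the chain code above ---
# Assumptions: the vectorstore passed to build_conversational_chain() is a small
# FAISS index built with HuggingFaceEmbeddings; the embedding model name and the
# sample texts below are placeholders chosen only for illustration.
if __name__ == "__main__":
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    # Build a tiny in-memory index so the retriever has something to search.
    sample_texts = [
        "LlamaCpp lets LangChain run GGUF models locally via llama-cpp-python.",
        "ConversationalRetrievalChain combines retrieval with stored chat history.",
    ]
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"  # assumed embedding model
    )
    vectorstore = FAISS.from_texts(sample_texts, embeddings)

    chain = build_conversational_chain(vectorstore)

    # The ConversationBufferMemory inside the chain keeps "chat_history",
    # so a follow-up question can refer back to the first answer.
    first = chain({"question": "What does ConversationalRetrievalChain do?"})
    print(first["answer"])

    follow_up = chain({"question": "And where is the chat history kept?"})
    print(follow_up["answer"])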