from huggingface_hub import hf_hub_download
from langchain_community.llms import LlamaCpp
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory


def load_llm():
    """
    Downloads a Qwen2.5 GGUF model and loads it via llama-cpp.
    """
    # Fetch the quantized weights from the Hugging Face Hub; the file is
    # cached under ./models after the first download.
    model_file = hf_hub_download(
        repo_id="bartowski/Qwen2.5-7B-Instruct-GGUF",
        filename="Qwen2.5-7B-Instruct-Q4_K_M.gguf",
        local_dir="./models",
        local_dir_use_symlinks=False,
    )

    llm = LlamaCpp(
        model_path=model_file,
        flash_attn=False,      # leave flash attention off for broad hardware support
        n_ctx=2048,            # context window size in tokens
        n_batch=512,           # prompt tokens processed per batch
        chat_format="chatml",  # Qwen2.5 instruct models use the ChatML template
    )

    return llm


def build_conversational_chain(vectorstore):
    """
    Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
    and a ConversationBufferMemory for multi-turn Q&A.
    """
    llm = load_llm()

    # Keep the full chat history between turns so follow-up questions can
    # refer back to earlier answers.
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5},  # pass the 5 most similar chunks to the LLM
        ),
        memory=memory,
        verbose=True,
    )

    return qa_chain
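

# A minimal usage sketch, not part of the chain-building code above: it
# assumes a FAISS index built with HuggingFaceEmbeddings (the embedding model
# name and sample texts below are illustrative placeholders), then runs one
# question through the conversational chain.
if __name__ == "__main__":
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS

    # Tiny illustrative corpus; replace with your own documents.
    texts = [
        "Qwen2.5 is a family of instruction-tuned language models.",
        "llama-cpp runs GGUF-quantized models on CPU or GPU.",
    ]
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.from_texts(texts, embeddings)

    chain = build_conversational_chain(vectorstore)
    result = chain.invoke({"question": "What does llama-cpp run?"})
    print(result["answer"])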