Fawaz0ibra committed · Commit c2d9892 · verified · 1 Parent(s): 6b771c5

Update chain_setup.py

Files changed (1):
  1. chain_setup.py +28 -38
chain_setup.py CHANGED
@@ -1,53 +1,43 @@
+import os
+from huggingface_hub import hf_hub_download
+from langchain.llms import LlamaCpp
 from langchain.chains import ConversationalRetrievalChain
-from langchain.llms import HuggingFacePipeline
 from langchain.memory import ConversationBufferMemory
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from transformers import BitsAndBytesConfig
-import transformers
-import torch
-import os
 
 def load_llm():
-    model_id = "Qwen/Qwen2.5-7B-Instruct"  # (Needs to be a 4-bit or 8-bit variant)
-    # bnb_config = BitsAndBytesConfig(
-    #     load_in_4bit=True,
-    #     bnb_4bit_compute_dtype=torch.float16,
-    #     bnb_4bit_quant_type="nf4"
-    # )
-
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        trust_remote_code=True,
-        # quantization_config=bnb_config,
-        device_map="auto"
+    """
+    Downloads a Qwen2.5 GGUF model and loads it via llama-cpp.
+    """
+    # 1) Download the GGUF model from Hugging Face
+    model_file = hf_hub_download(
+        repo_id="bartowski/Qwen2.5-7B-Instruct-GGUF",  # Non-math version
+        filename="Qwen2.5-7B-Instruct-Q4_K_M.gguf",    # Example file
+        local_dir="./models",
+        local_dir_use_symlinks=False
     )
-
-    # offload_folder = "offload"
-    # os.makedirs(offload_folder, exist_ok=True)
-
-    # model = transformers.AutoModelForCausalLM.from_pretrained(
-    #     model_id,
-    #     trust_remote_code=True,
-    #     device_map="auto",
-    #     offload_folder=offload_folder
-    # )
 
-    pipe = transformers.pipeline(
-        "text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        max_new_tokens=512
+    # 2) Load the model with llama-cpp via LangChain’s LlamaCpp
+    llm = LlamaCpp(
+        model_path=model_file,
+        # If you have a GPU that supports flash attention, set flash_attn to True
+        flash_attn=False,
+        n_ctx=8192,      # Large context if you have enough RAM
+        n_batch=1024,    # Adjust based on your system’s memory
+        # Qwen typically uses ChatML (<|im_start|> / <|im_end|> tokens)
+        # Setting chat_format='chatml' helps the model handle chat roles
+        chat_format='chatml'
     )
-    return pipe
-
+
+    return llm
+
 def build_conversational_chain(vectorstore):
     """
-    Creates a ConversationalRetrievalChain using the HuggingFacePipeline based LLM
+    Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
     and a ConversationBufferMemory for multi-turn Q&A.
     """
     llm = load_llm()
 
+    # We'll store chat history in memory so the chain can handle multi-turn conversations
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True
@@ -57,7 +47,7 @@ def build_conversational_chain(vectorstore):
         llm=llm,
         retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
         memory=memory,
-        verbose=True  # optional: enables debug logs
+        verbose=True
    )
 
     return qa_chain
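
For context, here is a minimal sketch of how the updated module might be driven end to end. It assumes a FAISS vectorstore built with a sentence-transformers embedding model; the loader, file path, embedding model, and question below are hypothetical stand-ins, not part of this commit:

# usage_sketch.py: hypothetical driver for chain_setup.py (not part of this commit)
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from chain_setup import build_conversational_chain

# Index a local text file into FAISS (hypothetical path and embedding model)
docs = TextLoader("docs/sample.txt").load()
chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)

# The chain's ConversationBufferMemory accumulates chat_history,
# so successive calls behave as one multi-turn conversation.
qa_chain = build_conversational_chain(vectorstore)
result = qa_chain({"question": "What does this document cover?"})
print(result["answer"])

A follow-up call such as qa_chain({"question": "Can you expand on that?"}) reuses the accumulated history, which is what the ConversationBufferMemory in build_conversational_chain is for.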