Fawaz0ibra committed · Commit c2d9892 · verified · 1 Parent(s): 6b771c5

Update chain_setup.py

Files changed (1):
  1. chain_setup.py +28 -38
chain_setup.py CHANGED
@@ -1,53 +1,43 @@
+import os
+from huggingface_hub import hf_hub_download
+from langchain.llms import LlamaCpp
 from langchain.chains import ConversationalRetrievalChain
-from langchain.llms import HuggingFacePipeline
 from langchain.memory import ConversationBufferMemory
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from transformers import BitsAndBytesConfig
-import transformers
-import torch
-import os
 
 def load_llm():
-    model_id = "Qwen/Qwen2.5-7B-Instruct"  # (Needs to be a 4-bit or 8-bit variant)
-    # bnb_config = BitsAndBytesConfig(
-    #     load_in_4bit=True,
-    #     bnb_4bit_compute_dtype=torch.float16,
-    #     bnb_4bit_quant_type="nf4"
-    # )
-
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        trust_remote_code=True,
-        # quantization_config=bnb_config,
-        device_map="auto"
+    """
+    Downloads a Qwen2.5 GGUF model and loads it via llama-cpp.
+    """
+    # 1) Download the GGUF model from Hugging Face
+    model_file = hf_hub_download(
+        repo_id="bartowski/Qwen2.5-7B-Instruct-GGUF",  # Non-math version
+        filename="Qwen2.5-7B-Instruct-Q4_K_M.gguf",    # Example file
+        local_dir="./models",
+        local_dir_use_symlinks=False
     )
-
-    # offload_folder = "offload"
-    # os.makedirs(offload_folder, exist_ok=True)
-
-    # model = transformers.AutoModelForCausalLM.from_pretrained(
-    #     model_id,
-    #     trust_remote_code=True,
-    #     device_map="auto",
-    #     offload_folder=offload_folder
-    # )
 
-    pipe = transformers.pipeline(
-        "text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        max_new_tokens=512
+    # 2) Load the model with llama-cpp via LangChain’s LlamaCpp
+    llm = LlamaCpp(
+        model_path=model_file,
+        # If you have a GPU that supports flash attention, set flash_attn to True
+        flash_attn=False,
+        n_ctx=8192,      # Large context if you have enough RAM
+        n_batch=1024,    # Adjust based on your system’s memory
+        # Qwen typically uses ChatML (<|im_start|> / <|im_end|> tokens)
+        # Setting chat_format='chatml' helps the model handle chat roles
+        chat_format='chatml'
     )
-    return pipe
-
+
+    return llm
+
 def build_conversational_chain(vectorstore):
     """
-    Creates a ConversationalRetrievalChain using the HuggingFacePipeline based LLM
+    Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
     and a ConversationBufferMemory for multi-turn Q&A.
     """
     llm = load_llm()
 
+    # We'll store chat history in memory so the chain can handle multi-turn conversations
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True
@@ -57,7 +47,7 @@ def build_conversational_chain(vectorstore):
         llm=llm,
         retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
         memory=memory,
-        verbose=True  # optional: enables debug logs
+        verbose=True
    )
 
     return qa_chain
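
For context, here is a minimal sketch of how the updated module might be driven end to end. It assumes a FAISS vectorstore built with a sentence-transformers embedding model; the loader, file path, embedding model, and question below are hypothetical stand-ins, not part of this commit:

# usage_sketch.py: hypothetical driver for chain_setup.py (not part of this commit)
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from chain_setup import build_conversational_chain

# Index a local text file into FAISS (hypothetical path and embedding model)
docs = TextLoader("docs/sample.txt").load()
chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)

# The chain's ConversationBufferMemory accumulates chat_history,
# so successive calls behave as one multi-turn conversation.
qa_chain = build_conversational_chain(vectorstore)
result = qa_chain({"question": "What does this document cover?"})
print(result["answer"])

A follow-up call such as qa_chain({"question": "Can you expand on that?"}) reuses the accumulated history, which is what the ConversationBufferMemory in build_conversational_chain is for.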