Goodnight7 committed on
Commit
20935cd
·
verified ·
1 Parent(s): d9bfe6a

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +105 -0
utils.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils
2
+
3
+ from langchain_chroma import Chroma
4
+ from langchain_nomic.embeddings import NomicEmbeddings
5
+ from langchain_core.documents import Document
6
+ from langchain.retrievers.document_compressors import CohereRerank
7
+ from langchain.retrievers import ContextualCompressionRetriever
8
+ from langchain.retrievers import BM25Retriever, EnsembleRetriever
9
+ from langchain_groq import ChatGroq
10
+ from dotenv import load_dotenv
11
+ from langchain_core.prompts import ChatPromptTemplate
12
+ from langchain_core.runnables import Runnable, RunnableMap
13
+ from langchain.schema import BaseRetriever
14
+ from qdrant_client import models
15
+
16
+ load_dotenv()
17
+ #Retriever
18
def retriever(n_docs=5):
    """Build the hybrid (vector + BM25) retriever wrapped in a Cohere reranker.

    Opens the persisted Chroma collection ``knowledge-base``, builds a BM25
    keyword retriever over the documents stored in that same collection,
    combines both retrievers with equal weights, and wraps the ensemble in a
    ``ContextualCompressionRetriever`` that uses ``CohereRerank``.

    Args:
        n_docs: Number of documents each base retriever (vector and keyword)
            is asked to return per query.

    Returns:
        The ``ContextualCompressionRetriever`` built over the ensemble.
    """
    vector_database_path = "knowledge-base"

    embeddings_model = NomicEmbeddings(
        model="nomic-embed-text-v1.5", inference_mode="local"
    )

    vectorstore = Chroma(
        collection_name="knowledge-base",
        persist_directory=vector_database_path,
        embedding_function=embeddings_model,
    )

    # The result size has to travel inside ``search_kwargs``; passing a bare
    # ``k=`` keyword to ``as_retriever`` is not the documented way to set it,
    # so the requested ``n_docs`` would not reach the vector search.
    vs_retriever = vectorstore.as_retriever(search_kwargs={"k": n_docs})

    # Read the collection once instead of once per field.
    stored = vectorstore.get()
    documents = [
        # ``metadata or {}`` guards against entries stored without metadata.
        Document(page_content=text, metadata=metadata or {})
        for text, metadata in zip(stored["documents"], stored["metadatas"])
    ]

    keyword_retriever = BM25Retriever.from_documents(documents)
    keyword_retriever.k = n_docs

    ensemble_retriever = EnsembleRetriever(
        retrievers=[vs_retriever, keyword_retriever],
        weights=[0.5, 0.5],
    )

    # NOTE(review): the reranker may apply its own cap on returned documents
    # independent of ``n_docs`` - confirm against the CohereRerank settings.
    compressor = CohereRerank(model="rerank-english-v3.0")
    return ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=ensemble_retriever
    )
50
+
51
# Retriever prompt: template text for the RAG chain. It contains two
# placeholders, ``{context}`` (the retrieved passages) and ``{input}`` (the
# user question); get_expression_chain supplies both keys when it fills the
# template.
rag_prompt = """You are an assistant for question-answering tasks.
The questions that you will be asked will mainly be about SUP'COM (also known as Higher School Of Communication Of Tunis).
Here is the context to use to answer the question:
{context}
Think carefully about the above context.
Now, review the user question:
{input}
Provide an answer to this questions using only the above context.
Answer:"""
61
+
62
+ # Post-processing
63
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string containing all page contents in order, or an empty string
        when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)
65
+
66
+ #RAG chain
67
def get_expression_chain(retriever: BaseRetriever, model_name="llama-3.1-70b-versatile", temp=0) -> Runnable:
    """Return a chain defined primarily in LangChain Expression Language.

    The chain takes a mapping with an ``"input"`` key, retrieves documents for
    that input, formats them into the ``context`` slot of ``rag_prompt`` and
    sends the filled prompt to a ``ChatGroq`` model.

    Args:
        retriever: Retriever used to fetch the documents for each question.
        model_name: Model identifier passed to ``ChatGroq``.
            NOTE(review): confirm this default is still served by the provider.
        temp: Sampling temperature passed to ``ChatGroq``.

    Returns:
        The composed runnable ``ingress | prompt | llm``.
    """

    def retrieve_context(input_text):
        # ``invoke`` is the Runnable entry point of a retriever; it replaces
        # the deprecated ``get_relevant_documents`` call.
        docs = retriever.invoke(input_text)
        return format_docs(docs)

    # Fan the incoming mapping out into the two variables the prompt expects.
    ingress = RunnableMap(
        {
            "input": lambda x: x["input"],
            "context": lambda x: retrieve_context(x["input"]),
        }
    )
    # The whole template, question included, is sent as one system message.
    prompt = ChatPromptTemplate.from_messages([("system", rag_prompt)])
    llm = ChatGroq(model=model_name, temperature=temp)

    return ingress | prompt | llm
92
+
93
# Shared embedding model, created once at import time and reused by
# get_embeddings.
embedding_model = NomicEmbeddings(
    model="nomic-embed-text-v1.5",
    inference_mode="local",
)


def get_embeddings(text):
    """Return the embedding of ``text`` computed as a ``search_document``.

    Args:
        text: The single text to embed.

    Returns:
        The first (and only) entry of the batch returned by
        ``embedding_model.embed``.
    """
    batch = embedding_model.embed([text], task_type="search_document")
    return batch[0]
97
+
98
+
99
+ # Create or connect to a Qdrant collection
100
def create_qdrant_collection(client, collection_name, vector_size=768):
    """Create the Qdrant collection ``collection_name`` unless it already exists.

    Args:
        client: Qdrant client exposing ``get_collections`` and
            ``create_collection``.
        collection_name: Name of the collection to create or reuse.
        vector_size: Dimensionality of the stored vectors. Defaults to 768,
            the value previously hard-coded here.

    Returns:
        None. The collection is created as a side effect when it is missing.
    """
    # ``get_collections().collections`` holds description objects, not plain
    # strings, so membership has to be tested against each entry's ``name``.
    existing_names = {
        description.name for description in client.get_collections().collections
    }
    if collection_name in existing_names:
        return

    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size, distance=models.Distance.COSINE
        ),
    )