nnngoc commited on
Commit
07e173c
·
1 Parent(s): 4bb7b01

update utility

Browse files
Files changed (1) hide show
  1. utility.py +147 -0
utility.py CHANGED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # function support rag pipeline
2
+ from typing import List
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.retrievers.multi_vector import MultiVectorRetriever
5
+ from langchain.storage import InMemoryStore
6
+ import uuid
7
+ from langchain.document_loaders import TextLoader, DirectoryLoader
8
+ import os
9
+ from sentence_transformers.cross_encoder import CrossEncoder
10
+ import numpy as np
11
+ from langchain.schema import BaseRetriever, Document
12
+ from typing import List
13
+ from langchain.callbacks.manager import CallbackManagerForRetrieverRun
14
+ from langchain.vectorstores import VectorStore
15
+ from langchain.load import dumps, loads
16
+ from typing import Any
17
+
18
+
19
+ def load_data(data_path):
20
+ folders = os.listdir(data_path)
21
+ dir_loaders = []
22
+ loaded_documents = []
23
+
24
+ for folder in folders:
25
+ dir_loader = DirectoryLoader(os.path.join(data_path, folder), loader_cls=TextLoader)
26
+ dir_loaders.append(dir_loader)
27
+
28
+ for dir_loader in dir_loaders:
29
+ loaded_documents.extend(dir_loader.load())
30
+
31
+ return loaded_documents
32
+
33
+ def process_data(data: List[str], child_text_splitter, embedding, vectorstore_name: str) -> MultiVectorRetriever:
34
+
35
+ # The vectorstore to use to index the child chunks
36
+ vectorstore = Chroma(
37
+ collection_name=vectorstore_name,
38
+ embedding_function=embedding,
39
+ # collection_metadata={"hnsw:space": "cosine"}
40
+ )
41
+
42
+ # The storage layer for the parent documents
43
+ store = InMemoryStore()
44
+ id_key = "doc_id"
45
+
46
+ # The retriever (empty to start)
47
+ retriever = MultiVectorRetriever(
48
+ vectorstore=vectorstore,
49
+ docstore=store,
50
+ id_key=id_key,
51
+ search_kwargs={"k": 25}
52
+ )
53
+
54
+ doc_ids = [str(uuid.uuid4()) for _ in data]
55
+ sub_docs = []
56
+
57
+ for i, doc in enumerate(data):
58
+ _id = doc_ids[i]
59
+ _sub_docs = child_text_splitter.split_documents([doc])
60
+ for _doc in _sub_docs:
61
+ _doc.metadata[id_key] = _id
62
+ sub_docs.extend(_sub_docs)
63
+
64
+ retriever.vectorstore.add_documents(sub_docs)
65
+ retriever.docstore.mset(list(zip(doc_ids, data)))
66
+
67
+ return vectorstore, retriever
68
+
69
+ class CustomRetriever(BaseRetriever):
70
+ # vectorstores:Chroma
71
+ retriever:Any
72
+
73
+ def reciprocal_rank_fusion(self, results: list[list], k=60):
74
+ """ Reciprocal_rank_fusion that takes multiple lists of ranked documents
75
+ and an optional parameter k used in the RRF formula """
76
+
77
+ # Initialize a dictionary to hold fused scores for each unique document
78
+ fused_scores = {}
79
+
80
+ # Iterate through each list of ranked documents
81
+ for docs in results:
82
+ # Iterate through each document in the list, with its rank (position in the list)
83
+ for rank, doc in enumerate(docs):
84
+ # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
85
+ doc_str = dumps(doc)
86
+ # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
87
+ if doc_str not in fused_scores:
88
+ fused_scores[doc_str] = 0
89
+ # Retrieve the current score of the document, if any
90
+ previous_score = fused_scores[doc_str]
91
+ # Update the score of the document using the RRF formula: 1 / (rank + k)
92
+ fused_scores[doc_str] += 1 / (rank + k)
93
+
94
+ # Sort the documents based on their fused scores in descending order to get the final reranked results
95
+ reranked_results = [
96
+ (loads(doc), score)
97
+ for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True) #[:10] #Top 10
98
+ ]
99
+
100
+ # Return the reranked results as a list of tuples, each containing the document and its fused score
101
+ rr_list=[]
102
+ for doc in reranked_results:
103
+ rr_list.append(doc[0])
104
+ return rr_list
105
+
106
+
107
+ def _get_relevant_documents(
108
+ self, queries: list, *, run_manager: CallbackManagerForRetrieverRun
109
+ ) -> List[Document]:
110
+ # Use your existing retriever to get the documents
111
+ documents=[]
112
+ for i in range(len(queries)):
113
+ document = self.retriever.get_relevant_documents(queries[i], callbacks=run_manager.get_child())
114
+ documents.append(document)
115
+
116
+ unique_documents = self.reciprocal_rank_fusion(documents)
117
+
118
+ # Get page content
119
+ docs_content = []
120
+ for i in range(len(unique_documents)):
121
+ docs_content.append(unique_documents[i].page_content)
122
+
123
+ # model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
124
+ # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-641M')
125
+ # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-642M-2') *
126
+ # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-644M-1')
127
+ # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-32-2M-2')
128
+ # model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-32-5M-1')
129
+ model = CrossEncoder('nnngoc/ms-marco-MiniLM-L-6-v2-32-6M-1')
130
+
131
+ # So we create the respective sentence combinations
132
+ sentence_combinations = [[queries[0], document] for document in docs_content]
133
+
134
+ # Compute the similarity scores for these combinations
135
+ similarity_scores = model.predict(sentence_combinations)
136
+
137
+ # Sort the scores in decreasing order
138
+ sim_scores_argsort = reversed(np.argsort(similarity_scores))
139
+
140
+ # Store the rerank document in new list
141
+ docs = []
142
+ for idx in sim_scores_argsort:
143
+ docs.append(unique_documents[idx])
144
+
145
+ docs_top_10 = docs[0:10]
146
+
147
+ return docs_top_10