File size: 9,681 Bytes
ce15bd8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
"""

rag_pipeline_utils.py



This python script implements various classes useful for a RAG pipeline.



Currently I have implemented:



   Text splitting

      SimpleTextSplitter: uses RecursiveTextSplitter

      SemanticTextSplitter: uses SemanticChunker (different threshold types can be used)



   VectorStore

      currently only sets up Qdrant vector store in memory

   

   AdvancedRetriever

      simple retriever is a special case - 

      advanced retriever - currently implemented MultiQueryRetriever



"""

from operator import itemgetter
from typing import List

from langchain_core.runnables import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from datasets import Dataset

from ragas import evaluate


def load_all_pdfs(list_of_pdf_files: List[str]) -> List[Document]:
    """Load every PDF in *list_of_pdf_files* and return all pages as one flat list.

    Each file is loaded with PyMuPDFLoader, which yields one Document per page.
    """
    all_pages: List[Document] = []
    for pdffile in list_of_pdf_files:
        pages = PyMuPDFLoader(file_path=pdffile).load()
        print(f'loaded {pdffile} with {len(pages)} pages ')
        all_pages.extend(pages)
    print(f'loaded all files: total number of pages: {len(all_pages)} ')
    return all_pages


class SimpleTextSplitter:
    """Fixed-size text splitting backed by RecursiveCharacterTextSplitter.

    Attributes:
        chunk_size:    target number of characters per chunk
        chunk_overlap: number of characters shared between adjacent chunks
        documents:     list of Documents to split
    """

    def __init__(self, chunk_size, chunk_overlap, documents):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.documents = documents

    def split_text(self):
        """Split the stored documents and return the list of chunks."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return splitter.split_documents(self.documents)


class SemanticTextSplitter:
    """Semantic text splitting backed by SemanticChunker.

    Attributes:
        llm_embeddings: embeddings model used to measure semantic similarity;
                        defaults to OpenAIEmbeddings() (created lazily)
        threshold_type: breakpoint threshold type passed to SemanticChunker,
                        e.g. "interquartile", "percentile", "standard_deviation"
        documents:      list of Documents to split
    """

    def __init__(self,
                 llm_embeddings=None,
                 threshold_type="interquartile",
                 documents=None):
        # NOTE: the default embeddings object is created here rather than in the
        # signature — a `llm_embeddings=OpenAIEmbeddings()` default would be
        # evaluated once at import time (and require API credentials at import).
        self.llm_embeddings = OpenAIEmbeddings() if llm_embeddings is None else llm_embeddings
        self.threshold_type = threshold_type
        self.documents = documents

    def split_text(self):
        """Split the stored documents into semantic chunks and return them."""
        # Bug fix: honor the configured threshold type; previously
        # "interquartile" was hard-coded here, silently ignoring
        # self.threshold_type.
        text_splitter = SemanticChunker(
            embeddings=self.llm_embeddings,
            breakpoint_threshold_type=self.threshold_type
        )

        print(f'loaded {len(self.documents)} to be split ')
        all_splits = text_splitter.split_documents(self.documents)
        print(f'returning docs split into {len(all_splits)} chunks ')
        return all_splits


class VectorStore:
    """Thin wrapper that creates a Qdrant collection and populates it.

    Attributes:
        location:  Qdrant location (e.g. ":memory:" or a URL)
        name:      collection name
        documents: documents to index
        size:      embedding dimension used for the collection's vectors
        embedding: embeddings model; defaults to OpenAIEmbeddings() (created lazily)

    Note: the constructor has side effects — it connects to Qdrant and creates
    the collection immediately.
    """

    def __init__(self,
                 location,
                 name,
                 documents,
                 size,
                 embedding=None):
        self.location = location
        self.name = name
        self.size = size
        self.documents = documents
        # Create the default embeddings object lazily: an `OpenAIEmbeddings()`
        # default in the signature is evaluated once at import time and would
        # require API credentials even when a custom embedding is supplied.
        self.embedding = OpenAIEmbeddings() if embedding is None else embedding

        self.qdrant_client = QdrantClient(self.location)
        self.qdrant_client.create_collection(
            collection_name=self.name,
            vectors_config=VectorParams(size=self.size, distance=Distance.COSINE),
        )

    def set_up_vectorstore(self):
        """Build the QdrantVectorStore, add the documents, and return self."""
        self.qdrant_vector_store = QdrantVectorStore(
            client=self.qdrant_client,
            collection_name=self.name,
            embedding=self.embedding
        )

        self.qdrant_vector_store.add_documents(self.documents)
        return self


class AdvancedRetriever:
    """Factory for retrievers built on top of a vector store.

    The "simple" retriever is plain top-k similarity search; the multi-query
    retriever wraps it with LLM-generated query variations.
    """

    def __init__(self, vectorstore):
        self.vectorstore = vectorstore

    def set_up_simple_retriever(self):
        """Return a similarity-search retriever with k=5."""
        return self.vectorstore.as_retriever(
            search_type='similarity',
            search_kwargs={'k': 5},
        )

    def set_up_multi_query_retriever(self, llm):
        """Return a MultiQueryRetriever that uses *llm* to rephrase queries."""
        base = self.set_up_simple_retriever()
        return MultiQueryRetriever.from_llm(retriever=base, llm=llm)


def run_and_eval_rag_pipeline(location, collection_name, embed_dim, text_splits, embeddings,
                              prompt, qa_llm, metrics, test_df):
    """Run a RAG pipeline over a test set and evaluate it with RAGAS.

    Inputs
        location:        memory or persistent store
        collection_name: name of the Qdrant collection
        embed_dim:       embedding dimension
        text_splits:     list containing text splits to index
        embeddings:      embeddings model used for the vector store
        prompt:          prompt used in the RAG pipeline
        qa_llm:          LLM used to generate responses
        metrics:         RAGAS metrics to compute
        test_df:         dataframe with "question" and "ground_truth" columns

    Returns
        (results, results_df): the RAGAS result object and its dataframe form
    """
    # Consistency: delegate pipeline construction to set_up_rag_pipeline
    # instead of duplicating the vector-store / retriever / chain wiring here.
    retrieval_chain = set_up_rag_pipeline(location=location, collection_name=collection_name,
                                          embeddings=embeddings, embed_dim=embed_dim,
                                          prompt=prompt, qa_llm=qa_llm,
                                          text_splits=text_splits)

    # get questions, and ground-truth
    test_questions = test_df["question"].values.tolist()
    test_groundtruths = test_df["ground_truth"].values.tolist()

    # run RAG pipeline, collecting answers and the retrieved contexts
    answers = []
    contexts = []
    for question in test_questions:
        response = retrieval_chain.invoke({"question": question})
        answers.append(response["response"].content)
        contexts.append([doc.page_content for doc in response["context"]])

    # Save RAG pipeline results to HF Dataset object
    response_dataset = Dataset.from_dict({
        "question": test_questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": test_groundtruths,
    })

    # Run RAGAS Evaluation - using metrics
    results = evaluate(response_dataset, metrics)

    # save results to df
    results_df = results.to_pandas()

    return results, results_df


def set_up_rag_pipeline(location, collection_name,
                        embeddings, embed_dim,
                        prompt, qa_llm,
                        text_splits,):
    """Build a retrieval chain: vector store -> retriever -> prompt -> LLM.

    Inputs
        location:           memory or persistent store
        collection_name:    name of collection, string
        embeddings:         object referring to embeddings to be used
        embed_dim:          embedding dimension
        prompt:             prompt used in RAG pipeline
        qa_llm:             LLM used to generate response
        text_splits:        list containing text splits

    Returns a retrieval chain
    """
    # vector store: create the collection and index the splits
    store = VectorStore(location=location,
                        name=collection_name,
                        documents=text_splits,
                        size=embed_dim,
                        embedding=embeddings).set_up_vectorstore()

    # retriever over the populated store
    retriever = AdvancedRetriever(vectorstore=store.qdrant_vector_store).set_up_simple_retriever()

    # q&a chain using LCEL: retrieve context for the question, then answer
    retrieval_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | qa_llm, "context": itemgetter("context")}
    )

    return retrieval_chain


def test_rag_pipeline(retrieval_chain, list_of_questions):
    """

    Tests RAG pipeline

    Inputs

        retrieval_chain:    retrieval chain

        list_of_questions:  list of questions to use to test RAG pipeline

    Output

        List of RAG-pipeline-generated responses to each question

    """
    all_answers = []
    for i, question in enumerate(list_of_questions):
        response = retrieval_chain.invoke({'question': question})
        answer = response["response"].content
        all_answers.append(answer)
    return all_answers


def get_vibe_check_on_list_of_questions(collection_name,
                                        embeddings, embed_dim,
                                        prompt, llm, text_splits,
                                        list_of_questions):
    """Build an in-memory RAG pipeline, answer the questions, and print Q&A pairs.

    Helper for manually reviewing ("vibe checking") pipeline quality: sets up
    a baseline retrieval chain, runs it over *list_of_questions*, and prints
    each question with its generated answer.

    Returns (retrieval_chain, q_and_a) where q_and_a is a list of
    (question, answer) tuples.
    """
    # set up baseline retriever (in-memory Qdrant collection)
    retrieval_chain = set_up_rag_pipeline(location=":memory:",
                                          collection_name=collection_name,
                                          embeddings=embeddings,
                                          embed_dim=embed_dim,
                                          prompt=prompt,
                                          qa_llm=llm,
                                          text_splits=text_splits)

    # run RAG pipeline and get responses
    answers = test_rag_pipeline(retrieval_chain, list_of_questions)

    # pair each question with its generated answer
    q_and_a = list(zip(list_of_questions, answers))

    # print out question/answer pairs to review the performance of the pipeline
    for i, (question, answer) in enumerate(q_and_a):
        print('=================')
        print(f'=====question number: {i} =============')
        print(question)
        print(answer)

    return retrieval_chain, q_and_a