"""

ragas_pipeline.py



Implements the core pipeline to generate test set for RAGAS.



"""

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.generator import TestsetGenerator
from ragas import evaluate
from datasets import Dataset

from myutils.rag_pipeline_utils import SimpleTextSplitter


class RagasPipeline:
    """Generates a RAGAS synthetic test set and evaluates RAG pipelines against it."""

    def __init__(self, generator_llm_model, critic_llm_model, embedding_model,
                 number_of_qa_pairs,
                 chunk_size, chunk_overlap, documents,
                 distributions):
        # LLMs used by RAGAS: one generates QA pairs, the other critiques them;
        # the embedding model is used to index the document chunks.
        self.generator_llm = ChatOpenAI(model=generator_llm_model)
        self.critic_llm = ChatOpenAI(model=critic_llm_model)
        self.embeddings = OpenAIEmbeddings(model=embedding_model)
        self.number_of_qa_pairs = number_of_qa_pairs

        # Chunking parameters and the source documents to split.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.documents = documents

        # Distribution of question types (e.g., simple, reasoning, multi_context).
        self.distributions = distributions

        self.generator = TestsetGenerator.from_langchain(
            self.generator_llm,
            self.critic_llm,
            self.embeddings
        )
    
    def generate_testset(self):
        """Split the documents, generate the test set, and return it as a DataFrame."""
        text_splitter = SimpleTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            documents=self.documents
        )
        ragas_text_splits = text_splitter.split_text()

        testset = self.generator.generate_with_langchain_docs(
            ragas_text_splits,
            self.number_of_qa_pairs,
            self.distributions
        )

        testset_df = testset.to_pandas()
        return testset_df

    def ragas_eval_of_rag_pipeline(self, retrieval_chain, ragas_questions, ragas_groundtruths, ragas_metrics):
        """
        Run the given RAG pipeline on the RAGAS test questions and score
        its answers with the supplied RAGAS metrics.
        """

        # Run the RAG pipeline on each RAGAS synthetic question, collecting
        # the generated answer and the retrieved contexts.
        answers = []
        contexts = []

        for question in ragas_questions:
            response = retrieval_chain.invoke({"question": question})
            answers.append(response["response"].content)
            contexts.append([context.page_content for context in response["context"]])

        # Package the results as a Hugging Face Dataset with the column
        # layout that ragas.evaluate expects.
        response_dataset = Dataset.from_dict({
            "question": ragas_questions,
            "answer": answers,
            "contexts": contexts,
            "ground_truth": ragas_groundtruths
        })

        # Score the dataset with the supplied RAGAS metrics.
        results = evaluate(response_dataset, ragas_metrics)

        # Return both the raw results object and a DataFrame view.
        results_df = results.to_pandas()
        return results, results_df
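

if __name__ == "__main__":
    # Usage sketch (illustration only, not part of the original module).
    # The model names, sample document, and metric choices below are
    # assumptions; substitute your own. Assumes ragas 0.1.x and an
    # OPENAI_API_KEY in the environment.
    from langchain_core.documents import Document
    from ragas.testset.evolutions import simple, reasoning, multi_context
    from ragas.metrics import faithfulness, answer_relevancy

    docs = [Document(page_content="Paris is the capital of France.")]  # toy corpus

    pipeline = RagasPipeline(
        generator_llm_model="gpt-4o-mini",         # assumed model name
        critic_llm_model="gpt-4o-mini",            # assumed model name
        embedding_model="text-embedding-3-small",  # assumed model name
        number_of_qa_pairs=5,
        chunk_size=500,
        chunk_overlap=50,
        documents=docs,
        distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    )
    testset_df = pipeline.generate_testset()
    print(testset_df.head())

    # Evaluating a RAG chain (commented out: `retrieval_chain` is a
    # hypothetical chain that returns {"response", "context"} as
    # ragas_eval_of_rag_pipeline expects):
    # results, results_df = pipeline.ragas_eval_of_rag_pipeline(
    #     retrieval_chain,
    #     ragas_questions=testset_df["question"].tolist(),
    #     ragas_groundtruths=testset_df["ground_truth"].tolist(),
    #     ragas_metrics=[faithfulness, answer_relevancy],
    # )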