In [1]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!mkdir static/
!mkdir static/training_data
!curl https://python.langchain.com/docs/tutorials/rag/ -o static/training_data/langchain_rag_tutorial.html

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import BSHTMLLoader

path = "static/training_data/"
text_loader = DirectoryLoader(path, glob="*.html", loader_cls=BSHTMLLoader)
docs = text_loader.load()
len(docs)

mkdir: static/: File exists
mkdir: static/training_data: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  340k  100  340k    0     0  2188k      0 --:--:-- --:--:-- --:--:-- 2196k


1

In [3]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

Generating personas: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.24s/it]                                           
Generating Scenarios: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:05<00:00,  2.76s/it]
Generating Samples: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:56<00:00,  5.68s/it]


In [4]:
df = dataset.to_pandas()

In [5]:
df

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Cud yu explane how Pydantic is used in LangCha...,[Build a Retrieval Augmented Generation (RAG) ...,The context mentions 'How to use LangChain wit...,single_hop_specifc_query_synthesizer
1,how langsmith help when buildin rag apps with ...,[the most powerful applications enabled by LLM...,LangSmith can help trace and understand your a...,single_hop_specifc_query_synthesizer
2,Wht is RAG in the context of AI applcations?,[Retrieval and generation: the actual RAG chai...,"RAG, or Retrieval and Generation, is a process...",single_hop_specifc_query_synthesizer
3,How does LangChain facilitate document retriev...,[Detailed walkthrough‚Äã Let‚Äôs go through the ab...,LangChain facilitates document retrieval and g...,single_hop_specifc_query_synthesizer
4,How does LangGraph enhance the development of ...,"[a TypedDict, but can also be a Pydantic BaseM...",LangGraph enhances the development of RAG appl...,single_hop_specifc_query_synthesizer
5,How does the LangGraph platform enhance the de...,"[<1-hop>\n\na TypedDict, but can also be a Pyd...",The LangGraph platform enhances the developmen...,multi_hop_specific_query_synthesizer
6,How does LangChain utilize document loaders an...,[<1-hop>\n\nRetrieval and generation: the actu...,"LangChain utilizes document loaders, such as t...",multi_hop_specific_query_synthesizer
7,"How does LangChain, Inc. facilitate the develo...",[<1-hop>\n\nstr query: Search context: List[Do...,"LangChain, Inc. facilitates the development of...",multi_hop_specific_query_synthesizer
8,How does the RAG technique facilitate sophisti...,[<1-hop>\n\nthe most powerful applications ena...,The RAG (Retrieval Augmented Generation) techn...,multi_hop_specific_query_synthesizer
9,How can LangChain JS/TS be utilized to build a...,[<1-hop>\n\nRetrieval and generation: the actu...,LangChain JS/TS can be utilized to build a Ret...,multi_hop_specific_query_synthesizer


In [6]:
# print reference_context for the first row
df.iloc[0]['reference_contexts']


['Build a Retrieval Augmented Generation (RAG) App: Part 1 | ü¶úÔ∏èüîó LangChain Skip to main contentJoin us at Interrupt: The Agent AI Conference by LangChain on May 13 & 14 in San Francisco!IntegrationsAPI ReferenceMoreContributingPeopleError referenceLangSmithLangGraphLangChain HubLangChain JS/TSv0.3v0.3v0.2v0.1üí¨SearchIntroductionTutorialsBuild a Question Answering application over a Graph DatabaseTutorialsBuild a simple LLM application with chat models and prompt templatesBuild a ChatbotBuild a Retrieval Augmented Generation (RAG) App: Part 2Build an Extraction ChainBuild an AgentTaggingBuild a Retrieval Augmented Generation (RAG) App: Part 1Build a semantic search engineBuild a Question/Answering system over SQL dataSummarize TextHow-to guidesHow-to guidesHow to use tools in a chainHow to use a vectorstore as a retrieverHow to add memory to chatbotsHow to use example selectorsHow to add a semantic layer over graph databaseHow to invoke runnables in parallelHow to stream chat 

In [7]:
len(df)

10

In [8]:
dataset.to_pandas().iloc[0]

user_input            Cud yu explane how Pydantic is used in LangCha...
reference_contexts    [Build a Retrieval Augmented Generation (RAG) ...
reference             The context mentions 'How to use LangChain wit...
synthesizer_name                   single_hop_specifc_query_synthesizer
Name: 0, dtype: object

In [None]:
# ***** DO NOT RUN THIS CELL AGAIN. IT WILL OVERWRITE THE RESULTS AND TAKE A LONG TIME TO RUN. *****

from backend.app.problem_generator import ProblemGenerationPipeline

problem_generator = ProblemGenerationPipeline(return_context=True)

# # test out on first row
# test_row = dataset.to_pandas().iloc[0]
# response = problem_generator.generate_problems({"query" : test_row['user_input']})
# print(response)
# print(response["response"])
# print(response["context"])

for test_row in dataset:
  response = problem_generator.generate_problems({"query" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

# result = evaluate(
#     dataset=evaluation_dataset,
#     metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
#     llm=evaluator_llm,
#     run_config=custom_run_config
# )
# result

{'response': '{\n    "questions": [\n        "What is the technique used by Q&A chatbots to answer questions about specific source information?",\n        "What is the main focus of Part 1 of the tutorial on building a Retrieval Augmented Generation (RAG) App?",\n        "What are the two main components of a typical RAG application?",\n        "How does LangSmith assist in understanding and tracing the RAG application\'s complexity?",\n        "For what type of data does the tutorial focus on Q&A, and where should one look for RAG over structured data?"\n    ]\n}', 'context': [Document(metadata={'source': 'static/data/langchain_rag_tutorial.html', '_id': 'fbd91b9b5d134d85bd8ef1f8c00633fd', '_collection_name': 'extending_context_window_llama_3'}, page_content='Tutorials\n\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\n\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\n\nOne of the most powerful applications enabled by LLMs is sophisticated question-answering (Q&A) 

Evaluating:  38%|‚ñà‚ñà‚ñà‚ñä      | 23/60 [02:09<10:37, 17.22s/it]Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-RPQcqL3OmEI37MmtuiAIn2UP on tokens per min (TPM): Limit 30000, Used 28807, Requested 1497. Please try again in 608ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 27/60 [02:27<04:24,  8.01s/it]Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-RPQcqL3OmEI37MmtuiAIn2UP on tokens per min (TPM): Limit 30000, Used 29408, Requested 1527. Please try again in 1.87s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 29/60 [02:41<03:55,  7.60s/it]Exception raised

{'context_recall': 0.4722, 'faithfulness': 0.3333, 'factual_correctness': 0.3030, 'answer_relevancy': 0.6319, 'context_entity_recall': 0.3208, 'noise_sensitivity_relevant': 0.1333}

Baseline Results:


`{'context_recall': 0.4722, 'faithfulness': 0.3333, 'factual_correctness': 0.3030, 'answer_relevancy': 0.6319, 'context_entity_recall': 0.3208, 'noise_sensitivity_relevant': 0.1333}`

In [9]:
hf_username = "Rsr2425"
FINETUNED_MODEL_ID = f"{hf_username}/simplify-ft-arctic-embed-l"

from backend.app.problem_generator import ProblemGenerationPipeline

problem_generator = ProblemGenerationPipeline(return_context=True, embedding_model_id=FINETUNED_MODEL_ID)
for test_row in dataset:
  response = problem_generator.generate_problems({"query" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ryanrodriguez/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/ryanrodriguez/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
Some weights of BertModel were not initialized from the model checkpoint at Rsr2425/simplify-ft-arctic-embed-l and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 27/60 [02:21<03:53,  7.07s/it]Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-RPQcqL3OmEI37MmtuiAIn2UP on tokens per min (TPM): Limit 30000, Used 29854, Requested 1458. Please try again in 2.624s. Visit https://platform

{'context_recall': 0.3000, 'faithfulness': 0.8667, 'factual_correctness': 0.3322, 'answer_relevancy': 0.7749, 'context_entity_recall': 0.3884, 'noise_sensitivity_relevant': 0.2448}

Finetuned Results:

`{'context_recall': 0.3000, 'faithfulness': 0.8667, 'factual_correctness': 0.3322, 'answer_relevancy': 0.7749, 'context_entity_recall': 0.3884, 'noise_sensitivity_relevant': 0.2448}`

In [13]:
print("Base Model Results")
{'context_recall': 0.4722, 'faithfulness': 0.3333, 'factual_correctness': 0.3030, 'answer_relevancy': 0.6319, 'context_entity_recall': 0.3208, 'noise_sensitivity_relevant': 0.1333}

Base Model Results


{'context_recall': 0.4722,
 'faithfulness': 0.3333,
 'factual_correctness': 0.303,
 'answer_relevancy': 0.6319,
 'context_entity_recall': 0.3208,
 'noise_sensitivity_relevant': 0.1333}

In [10]:
print("Fine-tuned Model Results")
{'context_recall': 0.3000, 'faithfulness': 0.8667, 'factual_correctness': 0.3322, 'answer_relevancy': 0.7749, 'context_entity_recall': 0.3884, 'noise_sensitivity_relevant': 0.2448}

Fine-tuned Model Results


{'context_recall': 0.3,
 'faithfulness': 0.8667,
 'factual_correctness': 0.3322,
 'answer_relevancy': 0.7749,
 'context_entity_recall': 0.3884,
 'noise_sensitivity_relevant': 0.2448}