Spaces:
Sleeping
Sleeping
File size: 1,777 Bytes
fa1332c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import os
import ragas
import pandas as pd
from datasets import Dataset, load_dataset
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from langchain.embeddings import AzureOpenAIEmbeddings, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas import evaluate
from ragas.metrics.critique import harmfulness
from ragas.metrics import (
context_precision,
answer_relevancy,
faithfulness,
context_recall,
context_relevancy)
from ragas.metrics._answer_correctness import answer_correctness
from ragas.metrics._answer_similarity import answer_similarity
# Ragas Evaluation
def ragas_eval(metrics, openai_api_key, df: pd.DataFrame) -> pd.DataFrame:
    """Score a DataFrame of RAG samples with the requested Ragas metrics.

    Args:
        metrics: Iterable of metric names. Recognised names are
            "answer_correctness", "answer_relevancy", "faithfulness",
            "context_precision", "context_recall", "context_relevancy" and
            "answer_similarity". Names outside this set are skipped.
        openai_api_key: Key exported as the ``OPENAI_API_KEY`` environment
            variable before the OpenAI chat model and embeddings are built.
        df: Input samples. A "context" column is renamed to "contexts" and a
            "ground_truths" column to "ground_truth"; every "contexts" value
            is wrapped in a one-element list. The caller's frame is left
            unmodified.

    Returns:
        A new DataFrame holding the renamed input columns plus every column
        that the evaluation results add, in the order the metrics were
        requested. The input index is preserved.

    Raises:
        KeyError: If the frame has neither a "context" nor a "contexts"
            column.
    """
    # The clients below are constructed without explicit credentials, so the
    # key is exported through the process environment first.
    os.environ["OPENAI_API_KEY"] = openai_api_key
    llm = ChatOpenAI()
    embeddings = OpenAIEmbeddings()

    # rename() without inplace returns a new frame, so the caller's DataFrame
    # is never renamed or re-wrapped (a second call on the same frame would
    # otherwise turn "contexts" into nested lists).
    df = df.rename(columns={"context": "contexts", "ground_truths": "ground_truth"})
    df["contexts"] = df["contexts"].apply(lambda x: [x])

    # preserve_index=False keeps a non-default index from leaking into the
    # dataset as an extra "__index_level_0__" column.
    eval_data = Dataset.from_pandas(df, preserve_index=False)

    metric_mappings = {
        "answer_correctness": answer_correctness,
        "answer_relevancy": answer_relevancy,
        "faithfulness": faithfulness,
        "context_precision": context_precision,
        "context_recall": context_recall,
        "context_relevancy": context_relevancy,
        "answer_similarity": answer_similarity,
    }

    for metric in metrics:
        if metric not in metric_mappings:
            continue
        result = evaluate(
            eval_data,
            metrics=[metric_mappings[metric]],
            llm=llm,
            embeddings=embeddings,
            raise_exceptions=False,
        )
        df2 = result.to_pandas()
        new_columns = [col for col in df2.columns if col not in df.columns]
        # Copy only the new columns, positionally: df2 may not share df's
        # index labels, and label alignment would then yield NaN scores.
        for col in new_columns:
            df[col] = df2[col].to_numpy()
    return df