# Phoenix Evaluation
import os
from getpass import getpass

import matplotlib.pyplot as plt
import nest_asyncio
import openai
import pandas as pd
from pycm import ConfusionMatrix
from sklearn.metrics import classification_report

import phoenix.evals.default_templates as templates
from phoenix.evals import (
    CODE_READABILITY_PROMPT_RAILS_MAP,
    CODE_READABILITY_PROMPT_TEMPLATE,
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    SQL_GEN_EVAL_PROMPT_RAILS_MAP,
    SQL_GEN_EVAL_PROMPT_TEMPLATE,
    TOXICITY_PROMPT_RAILS_MAP,
    TOXICITY_PROMPT_TEMPLATE,
    USER_FRUSTRATION_PROMPT_RAILS_MAP,
    USER_FRUSTRATION_PROMPT_TEMPLATE,
    OpenAIModel,
    download_benchmark_dataset,
    llm_classify,
    llm_generate,
)
from phoenix.evals.default_templates import (
    REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP,
    REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE,
)

nest_asyncio.apply()
def phoenix_eval(metrics, openai_api_key, df):
    os.environ["OPENAI_API_KEY"] = openai_api_key
    model = OpenAIModel(model="gpt-3.5-turbo", temperature=0.25)

    # Rename columns to the input names expected by the Phoenix eval templates
    df.rename(
        columns={"question": "input", "answer": "output", "cleaned_context": "reference"},
        inplace=True,
    )

    # Map each supported metric to its prompt template, rails map, and output column
    metric_mappings = {
        "hallucination": (HALLUCINATION_PROMPT_TEMPLATE, HALLUCINATION_PROMPT_RAILS_MAP, "Hallucination"),
        "toxicity": (TOXICITY_PROMPT_TEMPLATE, TOXICITY_PROMPT_RAILS_MAP, "Toxicity"),
        "relevance": (RAG_RELEVANCY_PROMPT_TEMPLATE, RAG_RELEVANCY_PROMPT_RAILS_MAP, "Relevancy"),
        "Q&A": (QA_PROMPT_TEMPLATE, QA_PROMPT_RAILS_MAP, "Q&A_eval"),
    }

    # Run each requested metric and store its labels in a new column
    for metric in metrics:
        if metric in metric_mappings:
            template, rails_map, column_name = metric_mappings[metric]
            rails = list(rails_map.values())
            classifications = llm_classify(
                dataframe=df,
                template=template,
                model=model,
                rails=rails,
                concurrency=20,
            )["label"].tolist()
            df[column_name] = classifications
        else:
            print(f"Warning: Metric '{metric}' is not supported.")

    # Rename columns back to their original names
    df.rename(
        columns={"input": "question", "output": "answer", "reference": "cleaned_context"},
        inplace=True,
    )
    return df
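

# Example usage: a minimal sketch, not part of the original script. The sample
# rows, their column values, and the OPENAI_API_KEY environment lookup below are
# illustrative assumptions; the input DataFrame only needs "question", "answer",
# and "cleaned_context" columns for phoenix_eval to run.
if __name__ == "__main__":
    sample_df = pd.DataFrame(
        {
            "question": ["What does the hallucination eval check?"],
            "answer": ["It checks whether the answer is grounded in the provided context."],
            "cleaned_context": [
                "The hallucination template classifies whether an answer is supported by the reference text."
            ],
        }
    )
    result = phoenix_eval(
        metrics=["hallucination", "Q&A"],
        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
        df=sample_df,
    )
    print(result[["question", "Hallucination", "Q&A_eval"]])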