sango07 committed
Commit 271d2c1 · verified · 1 Parent(s): fa1332c

Create phoenix_code.py

Files changed (1)
  1. phoenix_code.py +108 -0
phoenix_code.py ADDED
@@ -0,0 +1,108 @@
+ # Phoenix Evaluation
+ import os
+ from getpass import getpass
+
+ import nest_asyncio
+ nest_asyncio.apply()
+
+ import matplotlib.pyplot as plt
+ import openai
+ import pandas as pd
+ from pycm import ConfusionMatrix
+ from sklearn.metrics import classification_report
+
+ import phoenix.evals.default_templates as templates
+ from phoenix.evals import (
+     CODE_READABILITY_PROMPT_RAILS_MAP,
+     CODE_READABILITY_PROMPT_TEMPLATE,
+     HALLUCINATION_PROMPT_RAILS_MAP,
+     HALLUCINATION_PROMPT_TEMPLATE,
+     QA_PROMPT_RAILS_MAP,
+     QA_PROMPT_TEMPLATE,
+     RAG_RELEVANCY_PROMPT_RAILS_MAP,
+     RAG_RELEVANCY_PROMPT_TEMPLATE,
+     SQL_GEN_EVAL_PROMPT_RAILS_MAP,
+     SQL_GEN_EVAL_PROMPT_TEMPLATE,
+     TOXICITY_PROMPT_RAILS_MAP,
+     TOXICITY_PROMPT_TEMPLATE,
+     USER_FRUSTRATION_PROMPT_RAILS_MAP,
+     USER_FRUSTRATION_PROMPT_TEMPLATE,
+     OpenAIModel,
+     download_benchmark_dataset,
+     llm_classify,
+     llm_generate,
+ )
+ from phoenix.evals.default_templates import (
+     REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP,
+     REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE,
+ )
+
+
+ def phoenix_eval(metrics, openai_api_key, df):
+     # Expose the API key to the OpenAI client used by the Phoenix model wrapper
+     os.environ["OPENAI_API_KEY"] = openai_api_key
+     model = OpenAIModel(model="gpt-3.5-turbo", temperature=0.25)
+
+     # Rename columns to match the input names expected by the evaluation templates
+     df.rename(columns={"question": "input", "answer": "output", "cleaned_context": "reference"}, inplace=True)
+
+     # Map each supported metric to its prompt template, rails map, and output column name
+     metric_mappings = {
+         "hallucination": (HALLUCINATION_PROMPT_TEMPLATE, HALLUCINATION_PROMPT_RAILS_MAP, "Hallucination"),
+         "toxicity": (TOXICITY_PROMPT_TEMPLATE, TOXICITY_PROMPT_RAILS_MAP, "Toxicity"),
+         "relevance": (RAG_RELEVANCY_PROMPT_TEMPLATE, RAG_RELEVANCY_PROMPT_RAILS_MAP, "Relevancy"),
+         "Q&A": (QA_PROMPT_TEMPLATE, QA_PROMPT_RAILS_MAP, "Q&A_eval"),
+     }
+
+     # Loop over each metric in the provided metrics list
+     for metric in metrics:
+         if metric in metric_mappings:
+             template, rails_map, column_name = metric_mappings[metric]
+             rails = list(rails_map.values())
+
+             # Classify every row and store the predicted labels in a new column for this metric
+             classifications = llm_classify(
+                 dataframe=df, template=template, model=model, rails=rails, concurrency=20
+             )["label"].tolist()
+             df[column_name] = classifications
+         else:
+             print(f"Warning: Metric '{metric}' is not supported.")
+
+     # Rename the columns back to their original names
+     df.rename(columns={"input": "question", "output": "answer", "reference": "cleaned_context"}, inplace=True)
+
+     return df
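
A minimal usage sketch for the function above, assuming a pandas DataFrame that already has the "question", "answer", and "cleaned_context" columns that phoenix_eval renames internally; the sample row, the module-style import, and the API key are illustrative placeholders, not part of this commit:

import pandas as pd

from phoenix_code import phoenix_eval

# Hypothetical evaluation data; real rows would come from a RAG pipeline.
df = pd.DataFrame({
    "question": ["What does Phoenix evaluate?"],
    "answer": ["Phoenix scores LLM outputs for issues such as hallucination and relevance."],
    "cleaned_context": ["Arize Phoenix provides LLM tracing and evaluation tooling."],
})

# Run two of the supported metrics; each adds a label column to the returned DataFrame.
scored = phoenix_eval(metrics=["hallucination", "Q&A"], openai_api_key="sk-...", df=df)
print(scored[["Hallucination", "Q&A_eval"]])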