|
import json |
|
|
|
from assistants import GPTAgent |
|
import json_repair |
|
|
|
class evaluator:
    """LLM-as-a-judge scorer for AI-generated explanations.

    Sends a question/explanation pair to a GPT judge model, asks it to rate
    the explanation on five principles (each scored 0-1), and returns the
    parsed, validated score dictionary.
    """

    def __init__(self, model_name='GPT4-turbo'):
        """Create an evaluator backed by the given judge model.

        Args:
            model_name: Model identifier passed straight to ``GPTAgent``.
        """
        self.model = GPTAgent(model_name)

    def validate_scores(self, scores):
        """Check that ``scores`` holds every principle with a value in [0, 1].

        Args:
            scores: Mapping of principle name to numeric score.

        Returns:
            The same ``scores`` mapping, unchanged.

        Raises:
            ValueError: If any principle is missing, non-numeric, or
                outside the inclusive range [0, 1].
        """
        required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
        for key in required_keys:
            # .get() returns None for missing keys, which fails the
            # isinstance check below -- one path for "missing" and "bad".
            value = scores.get(key)
            if not isinstance(value, (int, float)) or not (0 <= value <= 1):
                raise ValueError(f"Score for '{key}' is missing or out of range. Received: {scores.get(key)}")
        return scores

    def __call__(self, question, explanation):
        """Score ``explanation`` for ``question`` with the judge model.

        Args:
            question: The user's question.
            explanation: The AI-generated explanation to evaluate.

        Returns:
            Dict mapping each of the five principles to a number in [0, 1],
            or ``None`` when the model reply cannot be parsed as JSON even
            after a repair attempt.

        Raises:
            ValueError: If the parsed reply fails score validation.
        """
        evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by

an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle

should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,

and 1 indicates that the principle is fully satisfied.



Question:

{question}



Provided Explanation:

{explanation}



Evaluation Criteria:



Factually Correct:

Definition: The explanation must be accurate and relevant to the question and the subject matter.

Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.



Useful:

Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.

Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?



Context Specific:

Definition: The explanation should be relevant to the specific context or scenario implied by the question.

Score: (0-1) How well does the explanation address the specific context or scenario of the question?



User Specific:

Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.

Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?



Provides Pluralism:

Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.

Score: (0-1) How well does the explanation provide or support multiple perspectives?



After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary.



Example JSON format:



{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}



Answer:

"""

        # BUG FIX: previously a hard-coded sample reply was parsed here and
        # the judge model was never consulted. Ask the model for real scores.
        # NOTE(review): assumes GPTAgent exposes ``interact(prompt) -> str``
        # returning the raw completion text -- confirm against
        # assistants.GPTAgent before merging.
        response = self.model.interact(evaluation_prompt)

        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            # Judge models often wrap the JSON in prose or emit slightly
            # malformed JSON; attempt a repair pass before giving up.
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
                return None

        return self.validate_scores(scores)
|
|
|
if __name__ == '__main__':
    # Smoke test: score a trivial explanation with the default judge model.
    # Renamed the local from ``eval`` to ``judge`` -- the original name
    # shadowed the ``eval`` builtin.
    judge = evaluator()
    question = "What is the capital of France?"
    explanation = "The capital of France is Paris."
    print(judge(question, explanation))