Spaces:
Build error
Build error
File size: 1,712 Bytes
c47212f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
from query_data import query_rag
from langchain_community.llms.ollama import Ollama
# Prompt template with {expected_response} and {actual_response} placeholders
# (filled in with str.format). It asks the reader of the prompt to answer
# 'true' or 'false' on whether the actual response matches the expected one.
EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response?
"""
def test_monopoly_rules():
    """Assert that the Monopoly starting-money question validates against "$1500"."""
    monopoly_question = "How much total money does a player start with in Monopoly? (Answer with the number only)"
    monopoly_answer = "$1500"
    # query_and_validate returns the pass/fail verdict; the test asserts on it.
    verdict = query_and_validate(
        question=monopoly_question,
        expected_response=monopoly_answer,
    )
    assert verdict
def test_ticket_to_ride_rules():
    """Assert that the Ticket to Ride longest-train question validates against "10 points"."""
    train_question = "How many points does the longest continuous train get in Ticket to Ride? (Answer with the number only)"
    train_answer = "10 points"
    # query_and_validate returns the pass/fail verdict; the test asserts on it.
    verdict = query_and_validate(
        question=train_question,
        expected_response=train_answer,
    )
    assert verdict
def query_and_validate(question: str, expected_response: str) -> bool:
    """Answer *question* via ``query_rag`` and let an LLM judge the result.

    The text returned by ``query_rag`` and ``expected_response`` are
    substituted into ``EVAL_PROMPT``; the prompt is sent to the "mistral"
    Ollama model, and the model's reply is parsed for a 'true'/'false'
    verdict. The prompt and the (colorized) verdict are printed to stdout.

    Args:
        question: Question forwarded to ``query_rag``.
        expected_response: Reference answer the actual response is judged
            against.

    Returns:
        True if the evaluator's verdict is 'true', False if it is 'false'.

    Raises:
        ValueError: If the evaluator's reply contains neither 'true' nor
            'false' as a whole word.
    """
    # Function-scope import so the block does not depend on file-level imports.
    import re

    response_text = query_rag(question)
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    model = Ollama(model="mistral")
    evaluation_results_str = model.invoke(prompt)
    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()

    print(prompt)

    # Take the FIRST whole-word verdict. A plain substring test with 'true'
    # checked first would score replies such as "false, that is not true" or
    # "untrue" as a pass.
    verdict = re.search(r"\b(true|false)\b", evaluation_results_str_cleaned)
    if verdict is None:
        raise ValueError(
            "Invalid evaluation result. Cannot determine if 'true' or 'false': "
            f"{evaluation_results_str_cleaned!r}"
        )

    if verdict.group(1) == "true":
        # Print response in Green if it is correct.
        print("\033[92m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return True

    # Print response in Red if it is incorrect.
    print("\033[91m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
    return False
|