Martín Santillán Cooper committed
Commit b022d45 · Parent(s): 2f382a0
Update result messages
app.py CHANGED

@@ -1,7 +1,7 @@
 import gradio as gr
 from dotenv import load_dotenv
 
-from utils import get_evaluated_component, get_evaluated_component_adjective, to_title_case, get_prompt_from_test_case, to_snake_case
+from utils import get_evaluated_component, get_evaluated_component_adjective, get_result_description, to_title_case, get_prompt_from_test_case, to_snake_case
 load_dotenv()
 import json
 from model import generate_text
@@ -49,12 +49,10 @@ def on_submit(criteria, context, user_message, assistant_message, state):
         'user_message': user_message,
         'assistant_message': assistant_message,
     }, state['selected_sub_catalog'])
-    evaluated_component = get_evaluated_component(state['selected_sub_catalog'], state['selected_criteria_name'])
-    evaluated_component_adjective = get_evaluated_component_adjective(state['selected_sub_catalog'], state['selected_criteria_name'])
     logger.debug(f"Starting evaluation for subcatelog {state['selected_sub_catalog']} and criteria name {state['selected_criteria_name']}")
     result_label = generate_text(prompt)['assessment'] # Yes or No
     # result_str = f"Is the {evaluated_component} message harmful based on the risk definition: {result_label}"
-    html_str = f"<p>
+    html_str = f"<p>{get_result_description(state['selected_sub_catalog'], state['selected_criteria_name'])} <strong>{result_label}</strong></p>"
     # html_str = f"Is the {evaluated_component} {evaluated_component_adjective}: {result_label}"
     return [gr.update(value=html_str), gr.update(visible=True), gr.update(visible=True)]
 
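For context on how the new html_str reaches the page, here is a minimal, self-contained Gradio sketch (not the Space's actual layout; the three output components and their labels are assumptions based only on the gr.update calls in the hunk above):

import gradio as gr

def on_submit_sketch():
    # Stand-ins for generate_text(prompt)['assessment'] and
    # get_result_description(...) from the commit.
    result_label = "Yes"
    description = "Is the user message harmful by common-sense?"
    html_str = f"<p>{description} <strong>{result_label}</strong></p>"
    # Three outputs, matching the commit's return value: the HTML result plus
    # two components that become visible once an evaluation has run.
    return [gr.update(value=html_str), gr.update(visible=True), gr.update(visible=True)]

with gr.Blocks() as demo:
    submit = gr.Button("Evaluate")
    result_html = gr.HTML()                           # receives html_str
    detail_a = gr.Button("Details", visible=False)    # hypothetical extra outputs
    detail_b = gr.Button("Reset", visible=False)
    submit.click(on_submit_sketch, inputs=None, outputs=[result_html, detail_a, detail_b])

demo.launch()
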
model.py CHANGED

@@ -58,7 +58,7 @@ def generate_text(prompt):
     mock_model_call = os.getenv('MOCK_MODEL_CALL') == 'true'
     if mock_model_call:
         logger.debug('Returning mocked model result.')
-        sleep(
+        sleep(1)
         return {'assessment': 'Yes', 'certainty': 0.97}
     else:
         start = time()
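The mocked branch now waits one second before returning a canned verdict, which keeps the UI behaviour closer to a real model call. A small standalone sketch of just that branch (the real-call path after else: is deliberately elided):

import os
from time import sleep

def generate_text_sketch(prompt):
    # Mirrors only the mocked branch from the hunk above.
    if os.getenv('MOCK_MODEL_CALL') == 'true':
        sleep(1)  # simulate latency, as added in this commit
        return {'assessment': 'Yes', 'certainty': 0.97}
    raise NotImplementedError("real model call not shown in this sketch")

os.environ['MOCK_MODEL_CALL'] = 'true'
print(generate_text_sketch("example prompt"))  # {'assessment': 'Yes', 'certainty': 0.97}
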
utils.py CHANGED

@@ -20,6 +20,21 @@ def get_prompt_template(test_case, sub_catalog_name):
 def get_prompt_from_test_case(test_case, sub_catalog_name):
     return assessment_prompt(Template(get_prompt_template(test_case, sub_catalog_name)).render(**test_case))
 
+def get_result_description(sub_catalog_name, criteria_name):
+    evaluated_component = get_evaluated_component(sub_catalog_name, criteria_name)
+    messages = {
+        "general_harm": f"Is the {evaluated_component} message harmful by common-sense?",
+        "bias": f"Is the {evaluated_component} message prejudiced based on identity or characteristics?",
+        "jailbreak": f"Does the user message try to manipulate the AI to generate harmful, undesired, or inappropriate content?",
+        "violence": f"Does the {evaluated_component} message contain content promoting physical, mental, or sexual harm?",
+        "profanity": f"Does the {evaluated_component} message include offensive language or insults?",
+        "unethical_behavior": f"Does the {evaluated_component} message include actions that violate moral or legal standards?",
+        "answer_relevance": f"Does the assistant response fail to address or properly answer the user question?",
+        "context_relevance": f"Is the retrieved context irrelevant to the user question or does not address their needs?",
+        "groundedness": f"Does the assistant response include claims or facts not supported by or contradicted by the provided context?"
+    }
+    return messages[criteria_name]
+
 def get_evaluated_component(sub_catalog_name, criteria_name):
     if sub_catalog_name == 'harmful_content_in_user_message':
         component = "user"
@@ -32,7 +47,7 @@ def get_evaluated_component(sub_catalog_name, criteria_name):
         component = "assistant"
     elif criteria_name == "answer_relevance":
         component = "assistant"
-    return
+    return component
 
 def get_evaluated_component_adjective(sub_catalog_name, criteria_name):
     if criteria_name == 'context_relevance' or criteria_name == 'answer_relevance':
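Beyond the new helper, the "return component" change fixes a real bug: the old bare return made get_evaluated_component return None, so any f-string interpolating its result would render the literal text "None". A quick illustrative comparison (simplified, hypothetical branches, not the full function):

def get_component_before(sub_catalog_name):
    # Simplified stand-in for the pre-commit code path.
    component = "user" if sub_catalog_name == 'harmful_content_in_user_message' else "assistant"
    return  # bare return, as before the fix -> None

def get_component_after(sub_catalog_name):
    component = "user" if sub_catalog_name == 'harmful_content_in_user_message' else "assistant"
    return component  # the committed fix

print(f"Is the {get_component_before('harmful_content_in_user_message')} message harmful?")
# -> Is the None message harmful?
print(f"Is the {get_component_after('harmful_content_in_user_message')} message harmful?")
# -> Is the user message harmful?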