Martín Santillán Cooper committed
Commit b022d45
1 Parent(s): 2f382a0

Update result messages

Files changed (3)
  1. app.py +2 -4
  2. model.py +1 -1
  3. utils.py +16 -1
app.py CHANGED
@@ -1,7 +1,7 @@
  import gradio as gr
  from dotenv import load_dotenv
 
- from utils import get_evaluated_component, get_evaluated_component_adjective, to_title_case, get_prompt_from_test_case, to_snake_case
+ from utils import get_evaluated_component, get_evaluated_component_adjective, get_result_description, to_title_case, get_prompt_from_test_case, to_snake_case
  load_dotenv()
  import json
  from model import generate_text
@@ -49,12 +49,10 @@ def on_submit(criteria, context, user_message, assistant_message, state):
          'user_message': user_message,
          'assistant_message': assistant_message,
      }, state['selected_sub_catalog'])
-     evaluated_component = get_evaluated_component(state['selected_sub_catalog'], state['selected_criteria_name'])
-     evaluated_component_adjective = get_evaluated_component_adjective(state['selected_sub_catalog'], state['selected_criteria_name'])
      logger.debug(f"Starting evaluation for subcatelog {state['selected_sub_catalog']} and criteria name {state['selected_criteria_name']}")
      result_label = generate_text(prompt)['assessment'] # Yes or No
      # result_str = f"Is the {evaluated_component} message harmful based on the risk definition: {result_label}"
-     html_str = f"<p>Is the {evaluated_component} {evaluated_component_adjective}: <strong>{result_label}</strong></p>"
+     html_str = f"<p>{get_result_description(state['selected_sub_catalog'], state['selected_criteria_name'])} <strong>{result_label}</strong></p>"
      # html_str = f"Is the {evaluated_component} {evaluated_component_adjective}: {result_label}"
      return [gr.update(value=html_str), gr.update(visible=True), gr.update(visible=True)]

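In short, app.py stops stitching the result banner together from get_evaluated_component and get_evaluated_component_adjective and instead asks the new get_result_description helper for a ready-made question. A minimal sketch of the updated flow, assuming the state dict carries the same keys used in on_submit above and picking a catalog/criteria pair that appears in this diff:

# Illustrative sketch only; mirrors the new on_submit logic, not the full Gradio app.
from utils import get_result_description

def build_result_html(state, result_label):
    # result_label is the 'assessment' value returned by generate_text ("Yes" or "No")
    description = get_result_description(state['selected_sub_catalog'], state['selected_criteria_name'])
    return f"<p>{description} <strong>{result_label}</strong></p>"

state = {'selected_sub_catalog': 'harmful_content_in_user_message', 'selected_criteria_name': 'bias'}
print(build_result_html(state, 'Yes'))
# <p>Is the user message prejudiced based on identity or characteristics? <strong>Yes</strong></p>
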
model.py CHANGED
@@ -58,7 +58,7 @@ def generate_text(prompt):
      mock_model_call = os.getenv('MOCK_MODEL_CALL') == 'true'
      if mock_model_call:
          logger.debug('Returning mocked model result.')
-         sleep(2)
+         sleep(1)
          return {'assessment': 'Yes', 'certainty': 0.97}
      else:
          start = time()
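
The only functional change in model.py is a shorter artificial delay on the mocked path. For clarity, a paraphrased sketch of that gate as it reads after this commit (names follow the diff; the real inference branch is elided):

import os
from time import sleep

def generate_text_mock_only(prompt):
    # Paraphrase of the MOCK_MODEL_CALL gate shown in the diff above.
    if os.getenv('MOCK_MODEL_CALL') == 'true':
        sleep(1)  # previously sleep(2)
        return {'assessment': 'Yes', 'certainty': 0.97}
    raise NotImplementedError('real model call elided in this sketch')
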
utils.py CHANGED
@@ -20,6 +20,21 @@ def get_prompt_template(test_case, sub_catalog_name):
  def get_prompt_from_test_case(test_case, sub_catalog_name):
      return assessment_prompt(Template(get_prompt_template(test_case, sub_catalog_name)).render(**test_case))
 
+ def get_result_description(sub_catalog_name, criteria_name):
+     evaluated_component = get_evaluated_component(sub_catalog_name, criteria_name)
+     messages = {
+         "general_harm": f"Is the {evaluated_component} message harmful by common-sense?",
+         "bias": f"Is the {evaluated_component} message prejudiced based on identity or characteristics?",
+         "jailbreak": f"Does the user message try to manipulate the AI to generate harmful, undesired, or inappropriate content?",
+         "violence": f"Does the {evaluated_component} message contain content promoting physical, mental, or sexual harm?",
+         "profanity": f"Does the {evaluated_component} message include offensive language or insults?",
+         "unethical_behavior": f"Does the {evaluated_component} message include actions that violate moral or legal standards?",
+         "answer_relevance": f"Does the assistant response fail to address or properly answer the user question?",
+         "context_relevance": f"Is the retrieved context irrelevant to the user question or does not address their needs?",
+         "groundedness": f"Does the assistant response include claims or facts not supported by or contradicted by the provided context?"
+     }
+     return messages[criteria_name]
+
  def get_evaluated_component(sub_catalog_name, criteria_name):
      if sub_catalog_name == 'harmful_content_in_user_message':
          component = "user"
@@ -32,7 +47,7 @@ def get_evaluated_component(sub_catalog_name, criteria_name):
          component = "assistant"
      elif criteria_name == "answer_relevance":
          component = "assistant"
-     return f"'{to_title_case(component)} message'"
+     return component
 
  def get_evaluated_component_adjective(sub_catalog_name, criteria_name):
      if criteria_name == 'context_relevance' or criteria_name == 'answer_relevance':
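
Taken together, get_evaluated_component now returns the bare component name ('user', 'assistant', ...) instead of a pre-formatted "'User message'" string, and get_result_description interpolates that name into the question shown next to the Yes/No verdict. A quick sanity check, using a catalog/criteria pair visible in this diff plus one hypothetical sub-catalog name:

from utils import get_evaluated_component, get_result_description

component = get_evaluated_component('harmful_content_in_user_message', 'profanity')
assert component == 'user'  # bare name now, no longer "'User message'"

print(get_result_description('harmful_content_in_user_message', 'profanity'))
# Does the user message include offensive language or insults?

# 'rag_quality' is a hypothetical sub-catalog name for illustration; the real catalog keys
# other than 'harmful_content_in_user_message' are not visible in this diff.
print(get_result_description('rag_quality', 'answer_relevance'))
# Does the assistant response fail to address or properly answer the user question?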