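"""LLM-based evaluation of generated hypotheses against gold hypotheses.

A gold/generated hypothesis pair is compared along three dimensions --
context, variables, and relation -- using an LLM judge, and the per-dimension
scores are aggregated into a final score.
"""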
import json
import logging

from openai import OpenAI

from .lm_utils import run_chatgpt_query_multi_turn
from .openai_helpers import get_response

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def get_score_from_answer(type, answer):
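    """Convert a raw LLM answer for one evaluation dimension into a score.

    For 'context' the answer maps to 1.0 (same) / 0.0 (different); for 'var' a JSON
    answer is parsed into precision/recall/F1 over variables; for 'rel' a JSON answer
    maps to 1.0 / 0.5 / 0.0. Unparsable answers yield -1.0 (or -1.0 entries for 'var').
    """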
    if type == 'context':
        answer = answer.replace('Answer:', '').strip()
        if answer.startswith('A)'):
            return 1.0
        elif answer.startswith('B)'):
            return 0.0
        return -1.0
    elif type == 'var':
        try:
            var_json = json.loads(answer)
            # print(f"var_json:{var_json}")
            p = 0.0
            r = 0.0
            f1 = 0.0
            if var_json['sizeB']:
                p = var_json['intersection'] / var_json['sizeB']
            if var_json['sizeA']:
                r = var_json['intersection'] / var_json['sizeA']
            if p > 0.0 and r > 0.0:
                f1 = (2 * p * r) / (p + r)
            else:
                f1 = 0.0
            eval_rec = {
                'p': p,
                'r': r,
                'f1': f1,
                'sizeA': var_json['sizeA'],
                'sizeB': var_json['sizeB'],
                'intersection': var_json['intersection'],
                'explanation': var_json['explanation'],
            }
            print(f'var_eval: {eval_rec}')
            return eval_rec
        except Exception:  # unparsable or incomplete JSON answer
            return {'p': -1.0, 'r': -1.0, 'f1': -1.0}
    elif type == 'rel':
        print(answer)
        try:
            rel_json = json.loads(answer)
            answer_str = rel_json['answer'].strip()
        except Exception:  # unparsable JSON answer
            return -1.0
        if answer_str.startswith('A') or 'very similar' in answer_str:
            return 1.0
        elif answer_str.startswith('B') or 'similar but general than HypoA' in answer_str:
            return 0.5
        elif answer_str.startswith('C') or 'different' in answer_str:
            return 0.0
        return -1.0
    return -1.0


def ask_dimension_question(
    query,
    gold_hypo,
    gold_workflow,
    gen_hypo,
    gen_workflow,
    dataset_meta,
    llm_used,
    dimension,
    dataset_type,
    use_column_metadata=True,
):
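    """Ask the LLM judge a single dimension question ('context', 'var', or 'rel')
    comparing the gold hypothesis (HypoA) against the generated hypothesis (HypoB).

    Returns a (question, raw_answer, score) tuple, where the score comes from
    get_score_from_answer.
    """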
    dimension_question = ''
    answer = ''
    score = 0.0
    if dimension == 'var':
        score = {'p': -1.0, 'r': -1.0, 'f1': -1.0}
    num_tokens = 256
    num_retries = 1
    json_response = False

    messages = [
        {
            'role': 'system',
            'content': 'You are an AI assistant that helps evaluate a data-driven hypothesis. You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.',
        },
    ]
    if dimension == 'context':
        dimension_question = """\
Question: Is HypoB defined in the same context as HypoA?
(Context refers to assumptions/stratification under which the hypotheses are defined.)
Options: A) same B) different
What is your answer?"""
    elif dimension == 'var':
        dimension_question = """\
Question: For both HypoA and HypoB, what are the different variables found in the hypotheses? \
Return your answer as a JSON object in the following format:
```json
{{
"sizeA": num of variables used in HypoA
"sizeB": num of variables used in HypoB
"intersection": num of variables common in HypoA and HypoB. Use *fuzzy matching* to determine intersection, accounting for paraphrases or slightly different surface forms
"explanation": a short text explanation about the variables
}}```
Answer:"""
        num_tokens = 512
        num_retries = 1
        json_response = True
    elif dimension == 'rel':
        dimension_question = """\
Question: Does HypoB exhibit the same relation as HypoA?
Compare using the following example hierarchy of relationships (based on specificity): \
"there exists a relationship" > "positive relationship" > "positive AND (linear OR quadratic)" > "positive AND linear".
Options: A) very similar B) similar but general than HypoA C) different
Return your answer as a JSON object in the following format:
```json
{{
"answer": one of the options from A) very similar B) similar but general than HypoA C) different
"explanation": a short text explanation about the relationship comparison
}}```
Answer:"""
        num_tokens = 512
        num_retries = 1
        json_response = True
    datasets_json = prepare_dataset_metadata_json(
        dataset_meta, dataset_type=dataset_type, use_column_metadata=use_column_metadata
    )

    dimension_question_str = f"""\
You are going to compare two natural-language hypotheses HypoA and HypoB accompanied with optional workflows: WorkflowA for HypoA and WorkflowB for HypoB. \
Both the hypotheses answer the natural language query "QUERY" over the dataset(s) described by dataset description(s) and column description(s) below. \
Compare HypoA and HypoB in terms of three aspects: Contexts, Variables, and Relations. \
E.g., for the hypothesis "From 1995 to 2009, the number of sandhill cranes around the tundra (Indigilka River) surged by an astounding ~10X":
* Contexts refer to stratification of the data under which the given hypothesis is True. E.g., "For all women", "From 1995 to 2009".
* Variables refer to the set of variables (either dependent or independent) that are mentioned in the hypothesis. E.g., number of sandhill cranes, location.
* Relations refer to the form of relation between the variables. E.g., "surged by ~10x".
Answer the following questions for a given pair of hypotheses, HypoA and HypoB, along with an explanation grounded on the QUERY and the DATASET(S).
Here is the metadata for the task:
```json
{{
"datasets": {datasets_json},
"query": {query},
"HypoA": {gold_hypo},
"WorkflowA": {gold_workflow},
"HypoB": {gen_hypo},
"WorkflowB": {gen_workflow}
}}
```
{dimension_question}"""
    messages.append({'role': 'user', 'content': dimension_question_str})

    response = None
    for retry in range(num_retries):
        response = run_chatgpt_query_multi_turn(
            messages=messages,
            model_name=llm_used,
            max_tokens=num_tokens,
            temperature=0,  # greedy decoding
            json_response=json_response,
        )
        if response is not None:
            break

    if response is not None:
        answer = response.choices[0].message.content.strip()
        score = get_score_from_answer(type=dimension, answer=answer)

    return dimension_question, answer, score


def prepare_dataset_metadata_json(dataset_meta, dataset_type, use_column_metadata=True):
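    """Serialize dataset metadata into a list of {dataset_description, columns} dicts
    for inclusion in the evaluation prompts. Column metadata is included only when
    use_column_metadata is True; 'real' datasets read columns from d['columns']['raw'].
    """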
    if dataset_meta is None:
        return [
            {
                'dataset_description': '',
                'columns': [],
            }
        ]
    datasets_json = []
    if dataset_type == 'real':
        for d in dataset_meta['datasets']:
            datasets_json.append(
                {
                    'dataset_description': d['description'],
                    'columns': [
                        {'name': col['name'], 'description': col['description']}
                        for col in d['columns']['raw']
                    ]
                    if use_column_metadata
                    else [],
                }
            )
    else:
        for d in dataset_meta['datasets']:
            datasets_json.append(
                {
                    'dataset_description': d['description'],
                    'columns': [
                        {'name': col['name'], 'description': col['description']}
                        for col in d['columns']
                    ]
                    if use_column_metadata
                    else [],
                }
            )
    return datasets_json


def get_sub_hypotheses(
    query,
    hypo,
    workflow,
    dataset_meta,
    llm_used,
    dataset_type,
    use_column_metadata=True,
):
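    """Use the LLM to segment a hypothesis (plus its workflow) into sub-hypotheses,
    each annotated with its context, variables, and relations. Returns the parsed
    JSON with the full hypothesis attached under 'full_hypo'.
    """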
    client = OpenAI()
    extraction_prompt = """\
Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract three dimensions that define the hypothesis: Context, Variables, and Relations. \
Here are the definitions for these dimensions:
- Contexts: Boundary conditions that limit the scope of a hypothesis. E.g., “for men over \
the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then extract the context from the dataset_description.
- Variables: Known concepts that interact in a meaningful way under a given context to \
produce the hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable.
- Relations: Interactions between a given set of variables under a given context to produce \
the hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \
or "None" if there is no interacting relationship.
Make sure to only use the information present in the hypothesis and the workflow. Do not add any new information. \
For each dimension, be specific, and do not omit any important details.
Here is the metadata for the task:
```json
{
"datasets": %s,
"hypothesis": "%s",
"workflow": "%s"
}
```
Return your answer as a JSON object in the following format:
```json
{
"sub_hypo": [
{
"text": the hypothesis in natural language,
"context": a short text description of the context of the hypothesis,
"variables": a list of columns involved in the hypothesis,
"relations": a short text description of the relationship between the variables of the hypothesis
},
...
]
}```
"""
    datasets_json = prepare_dataset_metadata_json(
        dataset_meta, dataset_type, use_column_metadata=use_column_metadata
    )
    _prompt = extraction_prompt % (datasets_json, hypo, workflow)
    sub_hypo_json = get_response(client, _prompt, model=llm_used, max_retry=1)

    if sub_hypo_json is not None:
        # print(f"full hypothesis: {hypo}")
        print(f'sub_hypo_json: {sub_hypo_json}')
    else:
        sub_hypo_json = {
            'sub_hypo': [],
        }
    sub_hypo_json['full_hypo'] = hypo

    return sub_hypo_json


def match_context_with_gpt(
    gold_hyp, gold_context, pred_hyp, pred_context, model='gpt-3.5-turbo'
):
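    """Ask the LLM whether the predicted context semantically matches the gold context.

    Returns True/False; False is also returned when no usable response is obtained.
    """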
    prompt = f"""\
Given a gold hypothesis, a gold context, a predicted hypothesis, and a predicted context, your task is \
to determine if the predicted context semantically matches the ground-truth context. \
Here is the definition for Context: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then the context is derived from the dataset_description. \
If the predicted context matches the gold context, return true, otherwise return false.
If both gold and predicted hypotheses are defined over the context of the full dataset, then also return true.
Here is the metadata for the task:
```json
{{
"gold_hypothesis": "{gold_hyp}",
"gold_context": "{gold_context}",
"predicted_hypothesis": "{pred_hyp}",
"predicted_context": "{pred_context}"
}}
```
Return your answer as a JSON object in the following format:
```json
{{
"match": true or false
}}
```"""
    client = OpenAI()
    output = get_response(client, prompt, model=model)
    if not output:  # no usable LLM response
        return False
    return output.get('match', False)


def is_matching_context(gold_hyp, gold_context, pred_hyp, pred_context, llm_used):
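    """Cheap exact-match check on contexts first; fall back to an LLM comparison.

    A 'None' context only matches another 'None' context (handled by the exact check).
    """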
    if gold_context == pred_context:
        return True
    if 'None' in [gold_context, pred_context]:
        return False
    return match_context_with_gpt(
        gold_hyp, gold_context, pred_hyp, pred_context, model=llm_used
    )


def run_eval_gold_vs_gen_NL_subhypo(
    query,
    gold_hypo,
    gold_workflow,
    gen_hypo,
    gen_workflow,
    dataset_meta,
    llm_used,
    context_score,
    dataset_type,
    use_column_metadata=True,
):
    # GPT-4-based evaluation of the generated hypothesis in terms of context, variables, and relation
    eval_rec = {
        'query': query,
        'HypoA': gold_hypo,
        'WorkflowA': gold_workflow,
        'HypoB': gen_hypo,
        'WorkflowB': gen_workflow,
    }

    for dimension in ['var', 'rel']:
        question, answer, score = ask_dimension_question(
            query,
            gold_hypo,
            gold_workflow,
            gen_hypo,
            gen_workflow,
            dataset_meta,
            llm_used,
            dimension=dimension,
            dataset_type=dataset_type,
            use_column_metadata=use_column_metadata,
        )
        eval_rec[dimension] = {'question': question, 'answer': answer, 'score': score}

    eval_rec['context'] = context_score
    eval_rec['accuracy_score'] = (
        1.0
        * eval_rec['context']['score']
        * eval_rec['var']['score']['f1']
        * eval_rec['rel']['score']
    )
    return eval_rec


def run_eval_gold_vs_gen_NL_hypo_workflow(
    query,
    gold_hypo,
    gold_workflow,
    gen_hypo,
    gen_workflow,
    dataset_meta,
    llm_used,
    dataset_type,
    use_column_metadata=True,
):
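    """Top-level evaluation of a generated hypothesis/workflow against the gold pair.

    Segments both hypotheses into sub-hypotheses, matches them by context, scores each
    matched pair with run_eval_gold_vs_gen_NL_subhypo, and returns an eval record with
    final_score = recall_context * mean_accuracy_score.
    """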
    # Input: Dataset Metadata, Query, Gold {Hg, Wg}, Predicted {Hp, Wp}
    # Output: eval_rec JSON that includes final_score
    # Procedure:
    #   1. Segment the gold hypothesis into sub-hypotheses [Hg1, Hg2, ...] (computed on the fly);
    #      each Hg_i is a natural-language sub-hypothesis. Do the same for the prediction: [Hp1, Hp2, ...].
    #   2. Compute the intersection: tuples (Hg_i, Hp_j) whose contexts match (without explicit context
    #      extraction), filtered so that each gold and predicted context is attached to at most one tuple.
    #   3. Compute recall_context programmatically from the covered gold sub-hypotheses.
    #   4. For each matched (Hg_i, Hp_j):
    #      - ask the LLM for #variables, #intersection, and an explanation, then compute f1_v programmatically;
    #      - ask the LLM for a relation match score (1.0, 0.5, or 0.0: very similar / similar but more general / different);
    #      - accumulate f1_v * score_r.
    #   5. accuracy_score = mean over matched pairs of (context_score * var_score * rel_score);
    #      final_score = recall_context * accuracy_score.
    eval_rec = {
        'query': query,
        'HypoA': gold_hypo,
        'WorkflowA': gold_workflow,
        'HypoB': gen_hypo,
        'WorkflowB': gen_workflow,
    }

    gold_sub_hypo_json = get_sub_hypotheses(
        query=query,
        hypo=gold_hypo,
        workflow=gold_workflow,
        dataset_meta=dataset_meta,
        llm_used=llm_used,
        dataset_type=dataset_type,
        use_column_metadata=use_column_metadata,
    )
    if len(gold_sub_hypo_json['sub_hypo']) == 0:
        gold_sub_hypo_json['sub_hypo'] = [
            {
                'text': gold_hypo,
                'context': 'None',
                'variables': [],
                'relations': '',
                'explanation': 'unable to segment',
            }
        ]
    print(f'gold_sub_hypo_json: {gold_sub_hypo_json}')
    gen_sub_hypo_json = get_sub_hypotheses(
        query=query,
        hypo=gen_hypo,
        workflow=gen_workflow,
        dataset_meta=dataset_meta,
        llm_used=llm_used,
        dataset_type=dataset_type,
        use_column_metadata=use_column_metadata,
    )
    if len(gen_sub_hypo_json['sub_hypo']) == 0:
        gen_sub_hypo_json['sub_hypo'] = [
            {
                'text': gen_hypo,
                'context': 'None',
                'variables': [],
                'relations': '',
                'explanation': 'unable to segment',
            }
        ]
    print(f'gen_sub_hypo_json: {gen_sub_hypo_json}')

    eval_rec['gold_sub_hypo'] = gold_sub_hypo_json
    eval_rec['gen_sub_hypo'] = gen_sub_hypo_json
    gold_subh_covered = []
    gen_subh_to_gold_subh = dict()
    gen_gold_subh_to_context = dict()

    for p_id, gen_subh in enumerate(gen_sub_hypo_json['sub_hypo']):
        gen_subh_to_gold_subh[p_id] = -1

        for g_id, gold_subh in enumerate(gold_sub_hypo_json['sub_hypo']):
            if g_id in gold_subh_covered:
                continue

            # match on context
            context_bool = is_matching_context(
                gold_subh['text'],
                gold_subh.get('context', ''),
                gen_subh['text'],
                gen_subh.get('context', ''),
                llm_used,
            )
            context_score = 1.0 if context_bool else 0.0

            if context_score == 1.0:  # match only when context_score == 1.0
                gen_subh_to_gold_subh[p_id] = g_id
                gold_subh_covered.append(g_id)
                gen_gold_subh_to_context[f'P{p_id}||G{g_id}'] = {
                    'question': f"""Comparing: GoldH: {gold_subh['text']}, GoldC: {gold_subh['context']}\nGenH: {gen_subh['text']}, GenC: {gen_subh['context']}""",
                    'answer': context_bool,
                    'score': context_score,
                }
                break

    print(f'gen_subh_to_gold_subh: {gen_subh_to_gold_subh}')
    eval_rec['gen_subh_to_gold_subh'] = gen_subh_to_gold_subh
    eval_rec['gold_subh_covered'] = gold_subh_covered
    matched_gold_gen_subh_evals = dict()
    sum_accuracy_score = 0.0
    for p_id, g_id in gen_subh_to_gold_subh.items():
        if g_id >= 0:
            key = f'P{p_id}||G{g_id}'
            context_score = gen_gold_subh_to_context[key]
            subh_eval_rec = run_eval_gold_vs_gen_NL_subhypo(
                query,
                gold_hypo,
                gold_workflow,
                gen_hypo,
                gen_workflow,
                dataset_meta,
                llm_used,
                context_score,
                dataset_type=dataset_type,
                use_column_metadata=use_column_metadata,
            )
            sum_accuracy_score += subh_eval_rec['accuracy_score']
            matched_gold_gen_subh_evals[key] = subh_eval_rec

    eval_rec['matched_gold_gen_subh_evals'] = matched_gold_gen_subh_evals
    eval_rec['recall_context'] = (
        len(gold_subh_covered) / len(gold_sub_hypo_json['sub_hypo'])
        if len(gold_sub_hypo_json['sub_hypo'])
        else 0.0
    )
    mean_accuracy_score = (
        sum_accuracy_score / len(gen_subh_to_gold_subh)
        if len(gen_subh_to_gold_subh)
        else 0.0
    )
    eval_rec['mean_accuracy_score'] = mean_accuracy_score

    final_score = eval_rec['recall_context'] * mean_accuracy_score
    eval_rec['final_score'] = final_score

    print(f'eval_rec: {json.dumps(eval_rec, indent=2)}')
    return eval_rec