logical-reasoning / llm_toolkit /logical_reasoning_utils.py
inflaton's picture
InternLM 2.5 results
5002792
raw
history blame
6.77 kB
import os
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from llm_toolkit.llm_utils import extract_answer
from tqdm import tqdm
# Import-time side effect: print the path of the module file being loaded.
print(f"loading {__file__}")
def calc_metrics(references, predictions, debug=False):
    """Compute exact-match accuracy of predictions against references.

    Every prediction is passed through ``extract_answer`` and then compared
    with the reference at the same position using ``==``.

    Args:
        references: Sized iterable of expected answers.
        predictions: Sized iterable of raw model outputs; must have the same
            length as ``references``.
        debug: When true, the positions of the mismatches are also returned.

    Returns:
        A dict with ``"accuracy"`` (fraction of exact matches, ``0.0`` when
        there is nothing to score) and, if ``debug`` is set,
        ``"incorrect_ids"`` (list of positions whose prediction did not match).

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(references) == len(
        predictions
    ), f"lengths are difference: {len(references)} != {len(predictions)}"

    predictions = [extract_answer(text) for text in predictions]
    correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]

    # Guard the division: an empty evaluation set used to raise
    # ZeroDivisionError instead of producing a result.
    total = len(references)
    accuracy = sum(correct) / total if total else 0.0

    results = {"accuracy": accuracy}
    if debug:
        results["incorrect_ids"] = [i for i, c in enumerate(correct) if c == 0]

    return results
def save_results(model_name, results_path, dataset, predictions, debug=False):
    """Store one model's predictions as a column of a results CSV.

    If ``results_path`` does not exist yet, the CSV is seeded from
    ``dataset.to_pandas()`` with the ``answer``, ``prompt`` and ``train_text``
    columns removed. Otherwise the existing CSV is read back and extended
    (or, if the column already exists, overwritten).

    Args:
        model_name: Name of the column that receives ``predictions``.
        results_path: Path of the CSV file to create or update.
        dataset: Object exposing ``to_pandas()``; only used when the CSV does
            not exist yet.
        predictions: One value per row of the results table.
        debug: When true, print the first row of the updated table.
    """
    if not os.path.exists(results_path):
        dir_path = os.path.dirname(results_path)
        # A bare file name has no directory part, and os.makedirs("") raises
        # FileNotFoundError, so only create directories when there are any.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        df = dataset.to_pandas()
        df.drop(columns=["answer", "prompt", "train_text"], inplace=True)
    else:
        df = pd.read_csv(results_path, on_bad_lines="warn")

    df[model_name] = predictions

    if debug:
        print(df.head(1))

    df.to_csv(results_path, index=False)
def load_logical_reasoning_dataset(data_path, tokenizer=None):
    """Load the train/dev CSV files and optionally render chat prompts.

    ``<data_path>/train.csv`` becomes the ``train`` split and
    ``<data_path>/dev.csv`` becomes the ``test`` split. When a tokenizer is
    supplied, every row gets two extra columns built with the tokenizer's
    chat template: ``prompt`` (the generation prompt) and ``train_text``
    (prompt + label + EOS token).

    Args:
        data_path: Directory (or path prefix) containing ``train.csv`` and
            ``dev.csv``.
        tokenizer: Optional tokenizer providing ``apply_chat_template`` and
            ``eos_token``. When falsy, the raw splits are returned unchanged.

    Returns:
        The object returned by ``load_dataset`` (after ``map`` when a
        tokenizer was given), with ``train`` and ``test`` splits.
    """
    train_data_file = data_path + "/train.csv"
    test_data_file = data_path + "/dev.csv"

    print("loading train/test data files")
    datasets = load_dataset(
        "csv",
        data_files={"train": train_data_file, "test": test_data_file},
    )

    if tokenizer:
        # The three placeholders are filled, in order, with the puzzle, the
        # truth and the participant's question.
        reasoning_prompt = """你是一个逻辑游戏的主持人。游戏规则如下:
1. 参与者会得到一个谜题。
2. 参与者可以通过提问来获取线索,尝试解开谜题。
3. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。
4. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。
5. 参与者需要根据回答来推理,并最终找出谜题的正确答案。
请严格按照这些规则回答参与者提出的问题。
谜题: {}
实际情况: {}
参与者提出的问题: {}
"""

        def formatting_prompts_func(examples):
            """Build the ``prompt`` and ``train_text`` columns for one batch."""
            questions = examples["text"]
            labels = examples["label"]
            puzzles = examples["puzzle"]
            truths = examples["truth"]

            # The last slot is a placeholder that is overwritten with the
            # user message for every row.
            messages = [
                {
                    "role": "system",
                    "content": "You are an expert in logical reasoning.",
                },
                None,
            ]

            # MODEL_NAME may be unset; os.getenv then returns None and calling
            # .lower() on it would raise AttributeError. Treat "unset" as
            # "not a Mistral model".
            model_name = os.getenv("MODEL_NAME") or ""
            if "mistral" in model_name.lower():
                # Drop the system message for Mistral models (presumably
                # their chat template does not accept one; confirm).
                messages = messages[1:]

            texts = []
            prompts = []
            for question, label, puzzle, truth in zip(
                questions, labels, puzzles, truths
            ):
                user_content = reasoning_prompt.format(puzzle, truth, question)
                messages[-1] = {"role": "user", "content": user_content}

                prompt = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                prompts.append(prompt)
                texts.append(prompt + label + tokenizer.eos_token)

            return {"train_text": texts, "prompt": prompts}

        datasets = datasets.map(
            formatting_prompts_func,
            batched=True,
        )

    print(datasets)
    return datasets
def eval_model(model, tokenizer, eval_dataset, max_new_tokens=4096):
    """Generate an answer for every prompt in ``eval_dataset``.

    Prompts are processed one at a time: each is tokenized, moved to the
    ``"cuda"`` device, passed to ``model.generate`` and decoded, and the
    decoded text is reduced to an answer with ``extract_answer``.

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        tokenizer: Tokenizer that is callable on a list of strings and
            provides ``batch_decode``.
        eval_dataset: Sized dataset with a ``"prompt"`` column.
        max_new_tokens: Generation budget per prompt (defaults to the
            previously hard-coded 4096).

    Returns:
        A list with one extracted answer per decoded output, in dataset order.
    """
    total = len(eval_dataset)
    # Fetch the column once; looking it up inside the loop repeats the column
    # access on every iteration.
    prompts = eval_dataset["prompt"]
    predictions = []
    for i in tqdm(range(total)):
        inputs = tokenizer(
            prompts[i : i + 1],
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(
            **inputs, max_new_tokens=max_new_tokens, use_cache=False
        )

        decoded_output = tokenizer.batch_decode(outputs)
        # Only the first example is extracted with debug output enabled.
        debug = i == 0
        predictions.extend(
            extract_answer(output, debug=debug) for output in decoded_output
        )

    return predictions
def save_model(
    model,
    tokenizer,
    include_gguf=True,
    include_merged=True,
    publish=True,
):
    """Save the model and tokenizer locally and optionally push them to the hub.

    The target names come from ``get_model_names`` using the ``MODEL_NAME``
    environment variable; the hub token is read from ``HF_TOKEN``. The whole
    procedure is best-effort: any exception is printed and swallowed.

    Args:
        model: Model exposing ``save_pretrained`` / ``push_to_hub`` and the
            ``*_merged`` / ``*_gguf`` variants used below.
        tokenizer: Tokenizer saved and pushed alongside the model.
        include_gguf: Also export (and, if publishing, push) a GGUF build.
        include_merged: Also export (and, if publishing, push) merged weights.
        publish: Push every saved artifact to the hub.
    """
    try:
        # An empty HF_TOKEN is normalised to None.
        token = os.getenv("HF_TOKEN") or None
        model_name = os.getenv("MODEL_NAME")

        save_method = "lora"
        quantization_method = "q5_k_m"

        # NOTE(review): get_model_names is neither defined nor imported in the
        # visible part of this file; if it is missing, the resulting NameError
        # is caught and printed by the handler below. Confirm where it lives.
        model_names = get_model_names(
            model_name, save_method=save_method, quantization_method=quantization_method
        )

        model.save_pretrained(model_names["local"])
        tokenizer.save_pretrained(model_names["local"])

        if publish:
            model.push_to_hub(
                model_names["hub"],
                token=token,
            )
            tokenizer.push_to_hub(
                model_names["hub"],
                token=token,
            )

        if include_merged:
            model.save_pretrained_merged(
                model_names["local"] + "-merged", tokenizer, save_method=save_method
            )
            if publish:
                # Use the same token and save method as the other calls; this
                # push previously passed token="" and a hard-coded "lora".
                model.push_to_hub_merged(
                    model_names["hub"] + "-merged",
                    tokenizer,
                    save_method=save_method,
                    token=token,
                )

        if include_gguf:
            model.save_pretrained_gguf(
                model_names["local-gguf"],
                tokenizer,
                quantization_method=quantization_method,
            )

            if publish:
                model.push_to_hub_gguf(
                    model_names["hub-gguf"],
                    tokenizer,
                    quantization_method=quantization_method,
                    token=token,
                )
    except Exception as e:
        # Deliberate best-effort: report the failure without raising.
        print(e)
def get_metrics(df, reference_column="english", first_model_column=2):
    """Score every model column of a results table with ``calc_metrics``.

    Columns from position ``first_model_column`` onwards are treated as model
    outputs and compared against ``df[reference_column]``. Each model's
    metrics are printed as they are computed.

    Args:
        df: Results table with one column per model.
        reference_column: Name of the column holding the expected answers.
            NOTE(review): the default ``"english"`` is kept for backward
            compatibility, but nothing visible in this file produces such a
            column; confirm against the CSV schema actually in use.
        first_model_column: Positional index of the first model column.

    Returns:
        A DataFrame with one row per model and the columns ``model`` (the
        column name after its last ``/``), ``accuracy`` and ``all_metrics``
        (the full dict returned by ``calc_metrics`` with ``debug=True``).
    """
    model_columns = list(df.columns[first_model_column:])

    # Keep only the final path component of each column name.
    metrics_df = pd.DataFrame(
        {"model": [col.split("/")[-1] for col in model_columns]}
    )

    references = df[reference_column]
    accuracy = []
    all_metrics = []
    for col in model_columns:
        metrics = calc_metrics(references, df[col], debug=True)
        print(f"{col}: {metrics}")

        accuracy.append(metrics["accuracy"])
        all_metrics.append(metrics)

    metrics_df["accuracy"] = accuracy
    metrics_df["all_metrics"] = all_metrics

    return metrics_df