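"""Evaluate generated summaries against reference summaries.

Reads a JSONL dataset (items with "text" and "summaries") and a parallel JSONL
file of model outputs, and reports average lengths, compression ratios,
token-level F1, and ROUGE precision/recall/F1.
"""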
import argparse
from collections import defaultdict

import numpy as np
import tqdm
from nltk import word_tokenize

import scrl.utils as utils
from scrl.eval_metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES


def get_hc_summary(output):
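    """Return the highest-scoring candidate summary from a search output."""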
    i = np.argmax(output["scores"])
    summary = output["summaries"][i]
    return summary


def main(args):
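    # `outputs` and `dataset` are parallel JSONL files: outputs[i] holds the
    # candidate summaries and scores for dataset[i].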
    outputs = list(utils.read_jsonl(args.outputs))
    dataset = list(utils.read_jsonl(args.dataset))

    all_scores = defaultdict(list)

    for i, item in tqdm.tqdm(enumerate(dataset), total=len(dataset)):
        src = item["text"]
        if args.lower_src:
            src = src.lower()
        tgts = item["summaries"]
        pred = get_hc_summary(outputs[i])

        # Optionally truncate the prediction to a fixed character budget.
        if args.max_chars > 0:
            pred = pred[:args.max_chars]
        # Tokenize the source once: whitespace-split if the dataset is
        # pretokenized, otherwise with NLTK.
        if args.pretokenized:
            src_tokens = src.split()
        else:
            src_tokens = word_tokenize(src)

        pred_tokens = word_tokenize(pred)
        if args.lower_summary:
            pred_tokens = [t.lower() for t in pred_tokens]
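        # Score the prediction against each reference summary.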
        item_scores = defaultdict(list)
        for tgt in tgts:
            if args.pretokenized:
                tgt_tokens = tgt.split()
            else:
                tgt_tokens = word_tokenize(tgt)
            if args.lower_summary:
                tgt_tokens = [t.lower() for t in tgt_tokens]

            token_fscore = compute_token_f1(tgt_tokens, pred_tokens, use_counts=True)

            rouge_scores = rouge_scorer.score(tgt, pred)
            for rouge_type, rouge_type_scores in rouge_scores.items():
                item_scores[f"{rouge_type}-p"].append(rouge_type_scores.precision)
                item_scores[f"{rouge_type}-r"].append(rouge_type_scores.recall)
                item_scores[f"{rouge_type}-f"].append(rouge_type_scores.fmeasure)

            item_scores["token-f1"].append(token_fscore)
            item_scores["tgt-len"].append(len(tgt_tokens))
            item_scores["tgt-cr"].append(len(tgt_tokens) / len(src_tokens))
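        # Average over references, then collect the per-item means.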
        for k, values in item_scores.items():
            item_mean = np.mean(values)
            all_scores[k].append(item_mean)

        all_scores["pred-len"].append(len(pred_tokens))
        all_scores["src-len"].append(len(src_tokens))
        all_scores["pred-cr"].append(len(pred_tokens) / len(src_tokens))

        if args.verbose:
            print("SRC:", src)
            print("TGT:", tgts[0])
            print("PRED:", pred)
            print("=" * 100)
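    # Corpus-level results: means over all evaluated items.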
    print("=" * 100)
    print("RESULTS:")

    print("=" * 20, "Length (#tokens):", "=" * 20)
    for metric in ("src-len", "tgt-len", "pred-len"):
        mean = np.mean(all_scores[metric])
        print(f"{metric}: {mean:.2f}")
    print()

    print("=" * 20, "Compression ratio:", "=" * 20)
    for metric in ("tgt-cr", "pred-cr"):
        mean = np.mean(all_scores[metric])
        print(f"{metric}: {mean:.2f}")
    print()

    print("=" * 20, "Token F1-Score:", "=" * 20)
    mean = np.mean(all_scores["token-f1"])
    print(f"f1-score: {mean:.3f}")
    print()

    print("=" * 20, "ROUGE F1-Scores:", "=" * 20)
    for rouge_type in ROUGE_TYPES:
        mean = np.mean(all_scores[f"{rouge_type}-f"])
        print(f"{rouge_type}: {mean:.4f}")
    print()

    print("=" * 20, "ROUGE Recall:", "=" * 20)
    for rouge_type in ROUGE_TYPES:
        mean = np.mean(all_scores[f"{rouge_type}-r"])
        print(f"{rouge_type}: {mean:.4f}")
    print()


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True,
                        help="JSONL dataset with 'text' and 'summaries' fields")
    parser.add_argument('--outputs', required=True,
                        help="JSONL file of model outputs (candidate summaries and scores)")
    parser.add_argument('--pretokenized', action="store_true",
                        help="split sources and references on whitespace instead of NLTK tokenization")
    parser.add_argument('--max-chars', type=int, default=-1,
                        help="truncate predictions to this many characters (<= 0 disables truncation)")
    parser.add_argument('--verbose', action="store_true",
                        help="print source, reference, and prediction for each item")
    parser.add_argument('--lower-src', action="store_true",
                        help="lowercase the source text before tokenization")
    parser.add_argument('--lower-summary', action="store_true",
                        help="lowercase prediction and reference tokens")
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())