File size: 5,475 Bytes
e17c9f2 b6336ac e17c9f2 c8709b2 e17c9f2 69e60be e17c9f2 c8709b2 e17c9f2 02069d7 e17c9f2 de0c71d e17c9f2 de0c71d e17c9f2 de0c71d e17c9f2 de0c71d e17c9f2 c8709b2 e17c9f2 de0c71d e17c9f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import os
from utils.paper_retriever import RetrieverFactory
from utils.llms_api import APIHelper
from utils.paper_client import PaperClient
from utils.header import ConfigReader
from omegaconf import OmegaConf
import click
import json
from loguru import logger
import warnings
from utils.hash import check_env, check_embedding
warnings.filterwarnings("ignore")
@click.group()
@click.pass_context
def main(ctx):
"""
Evaluate Retriever SN/KG/SNKG
"""
print("Mode:", ctx.invoked_subcommand)
@main.command(context_settings=dict(
ignore_unknown_options=True,
allow_extra_args=True,
))
@click.option(
"-c",
"--config-path",
default="./configs/datasets.yaml",
type=click.File(),
required=True,
help="Dataset configuration file in YAML",
)
@click.option(
"--ids-path",
default="assets/data/test_acl_2024.json",
type=click.File(),
required=True,
help="Dataset configuration file in YAML",
)
@click.pass_context
def retrieve(ctx,
config_path, ids_path
):
initial_kwargs={ctx.args[i][2:]: ctx.args[i+1] for i in range(0, len(ctx.args), 2)}
kwargs = {"RETRIEVE": {}, "DEFAULT": {}}
for k, v in initial_kwargs.items():
if "num" in k:
kwargs["RETRIEVE"][k] = int(v)
elif "s_" in k:
kwargs["RETRIEVE"][k] = float(v)
elif "use_cocite" in k:
kwargs["RETRIEVE"][k] = bool(int(v))
else:
kwargs["RETRIEVE"][k] = v
config = ConfigReader.load(config_path, **kwargs)
check_embedding(config.DEFAULT.embedding)
check_env()
log_dir = config.DEFAULT.log_dir
retriever_name = config.RETRIEVE.retriever_name
cluster_to_filter = config.RETRIEVE.use_cluster_to_filter
co_cite = config.RETRIEVE.use_cocite
if not os.path.exists(log_dir):
os.makedirs(log_dir)
print(f"Created log directory: {log_dir}")
log_file = os.path.join(
log_dir,
"retriever_eval_{}_cocite-{}_cluster-{}.log".format(
retriever_name, co_cite, cluster_to_filter
),
)
logger.add(log_file, level=config.DEFAULT.log_level)
logger.info("=== Retriever name : {} ===".format(retriever_name))
logger.info("Loaded configuration:\n{}".format(OmegaConf.to_yaml(config)))
api_helper = APIHelper(config)
paper_client = PaperClient()
precision = 0
filtered_precision = 0
recall = 0
filtered_recall = 0
num = 0
gt_reference_num = 0
retrieve_paper_num = 0
label_num = 0
top_k_precision = {p: 0 for p in config.RETRIEVE.top_k_list}
top_k_recall = {p: 0 for p in config.RETRIEVE.top_k_list}
# Init Retriever
rt = RetrieverFactory.get_retriever_factory().create_retriever(
retriever_name,
config
)
for line in ids_path:
paper = json.loads(line)
logger.info("\nbegin generate paper hash id {}".format(paper["hash_id"]))
# 1. Get Background
paper = paper_client.get_paper_by_id(paper["hash_id"])
if "background" in paper.keys():
bg = paper["background"]
else:
logger.error(f"paper hash_id {paper['hash_id']} doesn't have background...")
continue
if "entities" in paper.keys():
entities = paper["entities"]
else:
entities = api_helper.generate_entity_list(bg)
logger.info("\norigin entities from background: {}".format(entities))
cite_type = config.RETRIEVE.cite_type
if cite_type in paper and len(paper[cite_type]) >= 5:
target_paper_id_list = paper[cite_type]
else:
logger.warning(
"hash_id {} cite paper num less than 5 ...".format(paper["hash_id"])
)
continue
# 2. Retrieve
result = rt.retrieve(
bg, entities, need_evaluate=True, target_paper_id_list=target_paper_id_list
)
filtered_precision += result["filtered_precision"]
precision += result["precision"]
filtered_recall += result["filtered_recall"]
gt_reference_num += result["gt_reference_num"]
retrieve_paper_num += result["retrieve_paper_num"]
recall += result["recall"]
label_num += result["label_num"]
for k, v in result["top_k_matrix"].items():
top_k_recall[k] += v["recall"]
top_k_precision[k] += v["precision"]
num += 1
if num >= 100:
break
continue
logger.info("=== Finish Report ===")
logger.info(f"{'Test Paper Num:':<25} {num}")
logger.info(f"{'Average Precision:':<25} {precision/num:.3f}")
logger.info(f"{'Average Recall:':<25} {recall/num:.3f}")
logger.info(f"{'Average GT Ref Paper Num:':<25} {gt_reference_num/num:.3f}")
logger.info(f"{'Average Retrieve Paper Num:':<25} {retrieve_paper_num/num:.3f}")
logger.info(f"{'Average Label Num:':<25} {label_num/num:.3f}")
# Print Eval Result
logger.info("=== Top-K Metrics ===")
logger.info(
f"=== USE_COCIT: {co_cite}, USE_CLUSTER_TO_FILTER: {cluster_to_filter} ==="
)
logger.info("| Top K | Recall | Precision |")
logger.info("|--------|--------|-----------|")
for k in config.RETRIEVE.top_k_list:
if k <= retrieve_paper_num / num:
logger.info(
f"| {k:<5} | {top_k_recall[k]/num:.3f} | {top_k_precision[k]/num:.3f} |"
)
logger.info("=" * 40)
if __name__ == "__main__":
main()
|