paper-hero / run.py
Spico's picture
- refactor exhausted search API
7b40c73
from src.interfaces.aclanthology import AclanthologyPaperList
from src.interfaces.arxiv import ArxivPaperList
from src.interfaces.dblp import DblpPaperList
from src.utils import (
dump_paper_list_to_jsonlines,
dump_paper_list_to_markdown_checklist,
)
if __name__ == "__main__":
# use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
# `ee_query`` is an example, and you don't have to fill all the fields
ee_query = {
"title": [
["information extraction"],
["event", "extraction"],
["event", "argument", "extraction"],
["event", "detection"],
["event", "classification"],
["event", "tracking"],
["event", "relation", "extraction"],
["event", "prediction"],
["script", "learning"],
],
"venue": [
["acl"],
["emnlp"],
["naacl"],
["coling"],
["findings"],
["tacl"],
["cl"],
],
"author": [
["Heng Ji"],
["Dan Roth"],
],
"year": [
# multiple time spans with closed interval: ["2006", "2013"] means 2006-2013
["2006", "2013"],
["2018", "2022"],
],
"month": [
# the same as the `year` field
["4", "11"],
],
}
ee_papers = acl_paper_list.search(ee_query)
dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
dump_paper_list_to_jsonlines(ee_papers, "results/ee-paper-list.jsonl")
doc_query = {
"title": [
["document-level"],
],
"venue": [
["acl"],
["emnlp"],
["naacl"],
["coling"],
["findings"],
["tacl"],
["cl"],
],
}
doc_papers = acl_paper_list.search(doc_query)
dump_paper_list_to_markdown_checklist(doc_papers, "results/doc-paper-list.md")
dump_paper_list_to_jsonlines(doc_papers, "results/doc-paper-list.jsonl")
# arxiv papers
arxiv_paper_list = ArxivPaperList(
"cache/ee-arxiv.xml",
use_cache=True,
title=(
"Event Extraction OR Event Argument Extraction OR Event Detection"
" OR Event Classification OR Event Tracking"
" OR Event Relation Extraction OR Information Extraction"
" OR Event Prediction OR Script Learning"
),
category="cs.CL",
)
arxiv_ee_query = {
"title": [
["information extraction"],
["event", "extraction"],
["event", "argument", "extraction"],
["event", "detection"],
["event", "classification"],
["event", "tracking"],
["event", "relation", "extraction"],
["event", "prediction"],
["script", "learning"],
],
"venue": [
["cs.CL"],
],
}
arxiv_ee_papers = arxiv_paper_list.search(arxiv_ee_query)
dump_paper_list_to_markdown_checklist(
arxiv_ee_papers, "results/arxiv-ee-paper-list.md"
)
dump_paper_list_to_jsonlines(arxiv_ee_papers, "results/arxiv-ee-paper-list.jsonl")
# dblp papers
dblp_paper_list = DblpPaperList(
"./cache/dblp.json",
use_cache=True,
query="Event|Information|Argument|Script Extraction|Classification|Tracking|Prediction|Learning",
max_results=50000,
)
dblp_ee_query = {
"title": [
["information extraction"],
["event", "extraction"],
["event", "argument", "extraction"],
["event", "detection"],
["event", "classification"],
["event", "tracking"],
["event", "relation", "extraction"],
["event", "prediction"],
["script", "learning"],
],
"venue": [
["aaai"],
["ijcai"],
["icml"],
["iclr"],
["nips"],
["neurips"],
["sigir"],
["cvpr"],
["iccv"],
],
}
dblp_ee_papers = dblp_paper_list.search(dblp_ee_query)
dump_paper_list_to_markdown_checklist(
dblp_ee_papers, "results/dblp-ee-paper-list.md"
)
dump_paper_list_to_jsonlines(dblp_ee_papers, "results/dblp-ee-paper-list.jsonl")