from src.interfaces.aclanthology import AclanthologyPaperList from src.interfaces.arxiv import ArxivPaperList from src.interfaces.dblp import DblpPaperList from src.utils import ( dump_paper_list_to_jsonlines, dump_paper_list_to_markdown_checklist, ) if __name__ == "__main__": # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first acl_paper_list = AclanthologyPaperList("cache/aclanthology.json") # `ee_query`` is an example, and you don't have to fill all the fields ee_query = { "title": [ ["information extraction"], ["event", "extraction"], ["event", "argument", "extraction"], ["event", "detection"], ["event", "classification"], ["event", "tracking"], ["event", "relation", "extraction"], ["event", "prediction"], ["script", "learning"], ], "venue": [ ["acl"], ["emnlp"], ["naacl"], ["coling"], ["findings"], ["tacl"], ["cl"], ], "author": [ ["Heng Ji"], ["Dan Roth"], ], "year": [ # multiple time spans with closed interval: ["2006", "2013"] means 2006-2013 ["2006", "2013"], ["2018", "2022"], ], "month": [ # the same as the `year` field ["4", "11"], ], } ee_papers = acl_paper_list.search(ee_query) dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md") dump_paper_list_to_jsonlines(ee_papers, "results/ee-paper-list.jsonl") doc_query = { "title": [ ["document-level"], ], "venue": [ ["acl"], ["emnlp"], ["naacl"], ["coling"], ["findings"], ["tacl"], ["cl"], ], } doc_papers = acl_paper_list.search(doc_query) dump_paper_list_to_markdown_checklist(doc_papers, "results/doc-paper-list.md") dump_paper_list_to_jsonlines(doc_papers, "results/doc-paper-list.jsonl") # arxiv papers arxiv_paper_list = ArxivPaperList( "cache/ee-arxiv.xml", use_cache=True, title=( "Event Extraction OR Event Argument Extraction OR Event Detection" " OR Event Classification OR Event Tracking" " OR Event Relation Extraction OR Information Extraction" " OR Event Prediction OR Script Learning" ), category="cs.CL", ) arxiv_ee_query = { "title": [ ["information extraction"], ["event", "extraction"], ["event", "argument", "extraction"], ["event", "detection"], ["event", "classification"], ["event", "tracking"], ["event", "relation", "extraction"], ["event", "prediction"], ["script", "learning"], ], "venue": [ ["cs.CL"], ], } arxiv_ee_papers = arxiv_paper_list.search(arxiv_ee_query) dump_paper_list_to_markdown_checklist( arxiv_ee_papers, "results/arxiv-ee-paper-list.md" ) dump_paper_list_to_jsonlines(arxiv_ee_papers, "results/arxiv-ee-paper-list.jsonl") # dblp papers dblp_paper_list = DblpPaperList( "./cache/dblp.json", use_cache=True, query="Event|Information|Argument|Script Extraction|Classification|Tracking|Prediction|Learning", max_results=50000, ) dblp_ee_query = { "title": [ ["information extraction"], ["event", "extraction"], ["event", "argument", "extraction"], ["event", "detection"], ["event", "classification"], ["event", "tracking"], ["event", "relation", "extraction"], ["event", "prediction"], ["script", "learning"], ], "venue": [ ["aaai"], ["ijcai"], ["icml"], ["iclr"], ["nips"], ["neurips"], ["sigir"], ["cvpr"], ["iccv"], ], } dblp_ee_papers = dblp_paper_list.search(dblp_ee_query) dump_paper_list_to_markdown_checklist( dblp_ee_papers, "results/dblp-ee-paper-list.md" ) dump_paper_list_to_jsonlines(dblp_ee_papers, "results/dblp-ee-paper-list.jsonl")