InteractiveSurvey / src /evaluation /extract_abstracts.py
technicolor's picture
Add Django InteractiveSurvey project
a97d040
raw
history blame
1.27 kB
import os
import feedparser
import urllib.parse
def get_arxiv_abstract_by_title(title: str) -> str:
    """Return the abstract of the first arXiv entry matching *title*.

    The title is wrapped in double quotes, URL-encoded, and sent to the arXiv
    export API as a title-field query (``ti:"<title>"``) limited to a single
    result. The response feed is parsed with ``feedparser``.

    Args:
        title: Paper title to search for.

    Returns:
        The ``summary`` of the first feed entry, or the literal string
        ``"No abstract found"`` when the feed has no entries or the first
        entry carries no summary.
    """
    query_title = urllib.parse.quote(f'"{title}"')
    url = f"http://export.arxiv.org/api/query?search_query=ti:{query_title}&max_results=1"
    feed = feedparser.parse(url)

    entries = feed.entries
    if entries:
        # An entry without a summary would raise AttributeError on plain
        # attribute access; treat it the same as "nothing found" instead.
        summary = getattr(entries[0], "summary", None)
        if summary is not None:
            return summary
    return "No abstract found"
def main():
    """Write one Markdown file of arXiv abstracts per download folder.

    Scans the ``arxiv_downloads`` directory located next to this script. For
    every sub-directory, each ``*.pdf`` file name (without its extension) is
    used as a paper title and looked up via ``get_arxiv_abstract_by_title``.
    The numbered results are written to ``abstract_survey/<folder>.md``,
    also next to this script.

    Raises:
        FileNotFoundError: If the ``arxiv_downloads`` directory is missing.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    input_root = os.path.join(base_dir, 'arxiv_downloads')
    output_root = os.path.join(base_dir, 'abstract_survey')

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_root, exist_ok=True)

    # os.listdir order is arbitrary; sort so runs are reproducible.
    for folder_name in sorted(os.listdir(input_root)):
        folder_path = os.path.join(input_root, folder_name)
        if not os.path.isdir(folder_path):
            continue

        pdf_names = sorted(
            name for name in os.listdir(folder_path)
            if name.lower().endswith('.pdf')
        )

        # NOTE(review): this issues one API request per PDF with no pause
        # between calls; arXiv's API guidelines ask clients to throttle
        # successive requests -- consider adding a delay here.
        entries = []
        for index, pdf_name in enumerate(pdf_names, 1):
            title = os.path.splitext(pdf_name)[0]
            abstract = get_arxiv_abstract_by_title(title)
            entries.append(f"{index}: {pdf_name}\n{abstract}\n")

        md_path = os.path.join(output_root, f"{folder_name}.md")
        with open(md_path, 'w', encoding='utf-8') as md_file:
            md_file.write(f"# {folder_name} Abstracts\n\n")
            # Each entry is followed by one extra newline (a blank line
            # between records).
            md_file.writelines(entry + "\n" for entry in entries)
# Run the extraction only when this file is executed as a script, so that
# importing it (e.g. to reuse get_arxiv_abstract_by_title) has no side effects.
if __name__ == "__main__":
    main()