import json
import random

from bs4 import BeautifulSoup

import utils
def fetch_new_page(category):
    url = f'https://arxiv.org/list/{category}/new'
    return utils.fetch_page(url)

def fetch_recent_page(category):
    url = f'https://arxiv.org/list/{category}/recent'
    return utils.fetch_page(url)
def extract_new_data(category):
    # Collect paper ids from the 'new submissions' page, keeping only
    # papers whose titles pass the simple-title check.
    paper_ids = []
    page_content = fetch_new_page(category)
    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in lists:
        papers = listing.find_all('dt')
        paper_contents = listing.find_all('dd')
        titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip()
                  for paper_content in paper_contents]
        for paper, title in zip(papers, titles):
            if not utils.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                # Link text looks like 'arXiv:XXXX.XXXXX'; keep only the id part.
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids
def extract_recent_data(category):
    # Collect paper ids from the 'recent' listing page (no title filtering here).
    paper_ids = []
    page_content = fetch_recent_page(category)
    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in lists:
        papers = listing.find_all('dt')
        for paper in papers:
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids
def extract_data(category):
    # Merge new and recent ids, record unseen ones in arxiv.txt,
    # and return at most two ids that were not already on file.
    all_ids = set()
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    combined_data = new_data + recent_data
    for paper_id in combined_data:
        if not utils.check_data_in_file(paper_id, 'arxiv.txt'):
            utils.write_data_to_file(paper_id, 'arxiv.txt')
            all_ids.add(paper_id)
        if len(all_ids) >= 2:
            break
    return list(all_ids)
def extract_arxiv_data():
    if not utils.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")
    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }
    data = {}
    used_ids = set()
    for category, subcategories in categories.items():
        # Gather up to two ids per category, skipping ids already claimed
        # by another category.
        category_ids = set()
        for subcategory in subcategories:
            ids = extract_data(subcategory)
            for paper_id in ids:
                if paper_id not in used_ids:
                    category_ids.add(paper_id)
                    used_ids.add(paper_id)
                if len(category_ids) == 2:
                    break
            if len(category_ids) == 2:
                break
        # If a category came up short, pad it with ids already seen elsewhere.
        while len(category_ids) < 2:
            category_ids.add(random.choice(list(used_ids)))
        data[category] = {"ids": list(category_ids), "count": len(category_ids)}
    if not utils.upload_datafile('arxiv.txt'):
        raise Exception("Failed to upload datafile")
    return json.dumps(data, indent=4, ensure_ascii=False)
if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w') as f:
        f.write(data)