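"""Scrape the arXiv "new" and "recent" listing pages for each category,
deduplicate paper IDs against the persisted arxiv.txt record (downloaded and
re-uploaded via the local `tools` helper module), and return a JSON summary
of the selected paper IDs per top-level category."""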
import json
import random
import tools
from bs4 import BeautifulSoup

def fetch_new_page(category):
    url = f'https://arxiv.org/list/{category}/new'
    return tools.fetch_page(url)


def fetch_recent_page(category):
    url = f'https://arxiv.org/list/{category}/recent'
    return tools.fetch_page(url)

def extract_new_data(category):
    # Collect paper IDs from the "new submissions" listing for a category.
    paper_ids = []
    page_content = fetch_new_page(category)
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        paper_contents = listing.find_all('dd')
        titles = [
            paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip()
            for paper_content in paper_contents
        ]
        for paper, title in zip(papers, titles):
            # Skip entries whose titles fail the simple title check.
            if not tools.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                # Link text looks like "arXiv:2501.01234"; keep only the ID part.
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_recent_data(category):
    # Collect paper IDs from the "recent" listing for a category.
    paper_ids = []
    page_content = fetch_recent_page(category)
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        for paper in papers:
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_data(category):
    sanitized_data = []
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    data = list(set(new_data + recent_data))
    if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
        # High Energy Physics subcategories are capped at 3 papers each.
        category_list = []
        for paper_id in data:
            if len(category_list) >= 3:
                break
            if tools.check_data_in_file(paper_id, 'arxiv.txt'):
                continue
            category_list.append(paper_id)
        for category_id in category_list:
            sanitized_data.append(category_id)
            tools.write_data_to_file(category_id, 'arxiv.txt')
    else:
        # All other categories are capped at 12 papers.
        for paper_id in data:
            if len(sanitized_data) >= 12:
                break
            if tools.check_data_in_file(paper_id, 'arxiv.txt'):
                continue
            tools.write_data_to_file(paper_id, 'arxiv.txt')
            sanitized_data.append(paper_id)
    random.shuffle(sanitized_data)
    print(len(sanitized_data))
    return sanitized_data

def extract_arxiv_data():
    if not tools.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")
    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }
    data = {}
    for category, subcategories in categories.items():
        category_data = {}
        all_ids = []
        for subcategory in subcategories:
            all_ids.extend(extract_data(subcategory))
        if len(all_ids) > 12:
            print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
            random.shuffle(all_ids)
            all_ids = all_ids[:12]
        category_data['count'] = len(all_ids)
        category_data['ids'] = all_ids
        data[category] = category_data
    data = json.dumps(data, indent=4, ensure_ascii=False)
    if not tools.upload_datafile('arxiv.txt'):
        raise Exception("Failed to upload datafile")
    return data

if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w') as f:
        f.write(data)