Spaces:

raannakasturi
/

ReXploreIDFetchingAPI

Sleeping

App Files Files Community

ReXploreIDFetchingAPI / arvix.py

raannakasturi

Add count of category and PMC IDs to data output in extract_arxiv_data and extract_pmc_data functions

5a2457c 6 months ago

raw

history blame

3.83 kB

	import json
	import random
	from bs4 import BeautifulSoup
	import utils

	def fetch_new_page(category):
	    """Fetch the arXiv "new" listing page for ``category``.

	    Builds ``https://arxiv.org/list/<category>/new`` and passes it to
	    ``utils.fetch_page``, returning whatever that helper returns.
	    """
	    return utils.fetch_page(f'https://arxiv.org/list/{category}/new')

	def fetch_recent_page(category):
	    """Fetch the arXiv "recent" listing page for ``category``.

	    Builds ``https://arxiv.org/list/<category>/recent`` and passes it to
	    ``utils.fetch_page``, returning whatever that helper returns.
	    """
	    return utils.fetch_page(f'https://arxiv.org/list/{category}/recent')

	def extract_new_data(category):
	    """Collect paper IDs from the "new" listing page of ``category``.

	    Every ``<dl>`` on the page is walked as parallel ``<dt>`` / ``<dd>``
	    sequences. An entry is kept only when its title (the text of the
	    ``div.list-title`` element with the ``Title:`` label removed) passes
	    ``utils.verify_simple_title``.

	    Entries that lack a title element, lack a link, or whose link text
	    contains no ``:`` separator are skipped instead of raising.

	    Args:
	        category: arXiv category slug used to build the listing URL.

	    Returns:
	        A list of ID strings in page order; duplicates are not removed.
	    """
	    paper_ids = []
	    page_content = fetch_new_page(category)
	    soup = BeautifulSoup(page_content, 'html.parser')
	    # Named ``listing`` (not ``list``) so the builtin is not shadowed.
	    for listing in soup.find_all('dl'):
	        # Pair each <dt> with its <dd> directly so that skipping a
	        # malformed entry cannot shift the title/link alignment.
	        entries = zip(listing.find_all('dt'), listing.find_all('dd'))
	        for paper, paper_content in entries:
	            title_div = paper_content.find('div', class_='list-title')
	            if title_div is None:
	                # find() yields None when nothing matches; skip the
	                # entry rather than fail on ``.text``.
	                continue
	            title = title_div.text.strip().split('Title:')[-1].strip()
	            if not utils.verify_simple_title(title):
	                continue
	            paper_link = paper.find('a', href=True)
	            if not paper_link:
	                continue
	            # Presumably the link text looks like "arXiv:<id>"; the ID
	            # is the segment after the first colon.
	            parts = paper_link.text.strip().split(':')
	            if len(parts) < 2:
	                continue
	            paper_ids.append(parts[1])
	    return paper_ids

	def extract_recent_data(category):
	    """Collect paper IDs from the "recent" listing page of ``category``.

	    Every ``<dt>`` inside every ``<dl>`` on the page is inspected; the
	    text of its first anchor carrying an ``href`` is split on ``:`` and
	    the second segment is taken as the ID. No title filtering is done.

	    Entries without such a link, or whose link text contains no ``:``
	    separator, are skipped instead of raising.

	    Args:
	        category: arXiv category slug used to build the listing URL.

	    Returns:
	        A list of ID strings in page order; duplicates are not removed.
	    """
	    paper_ids = []
	    page_content = fetch_recent_page(category)
	    soup = BeautifulSoup(page_content, 'html.parser')
	    # Named ``listing`` (not ``list``) so the builtin is not shadowed.
	    for listing in soup.find_all('dl'):
	        for paper in listing.find_all('dt'):
	            paper_link = paper.find('a', href=True)
	            if not paper_link:
	                continue
	            # Presumably the link text looks like "arXiv:<id>"; the ID
	            # is the segment after the first colon.
	            parts = paper_link.text.strip().split(':')
	            if len(parts) < 2:
	                continue
	            paper_ids.append(parts[1])
	    return paper_ids

	def extract_data(category):
	    """Return up to four IDs for ``category`` not yet in ``arxiv.txt``.

	    IDs from the "new" listing are considered first, then those from the
	    "recent" listing. Each ID that ``utils.check_data_in_file`` does not
	    report as present is written to ``arxiv.txt`` and collected; the scan
	    stops once four IDs have been collected.

	    Returns:
	        A list built from a set, so the element order is unspecified.
	    """
	    candidates = extract_new_data(category) + extract_recent_data(category)
	    fresh_ids = set()
	    for candidate in candidates:
	        if utils.check_data_in_file(candidate, 'arxiv.txt'):
	            # Already recorded: nothing to write or collect.
	            continue
	        utils.write_data_to_file(candidate, 'arxiv.txt')
	        fresh_ids.add(candidate)
	        if len(fresh_ids) >= 4:
	            break
	    return [*fresh_ids]

	def extract_arxiv_data():
	    """Build a JSON report of paper IDs grouped by top-level category.

	    Downloads the ``arxiv.txt`` data file, gathers up to four unseen IDs
	    per category by querying its subcategories in order via
	    ``extract_data``, then uploads the data file again.

	    A category that yields fewer than four fresh IDs is topped up with
	    randomly chosen IDs already assigned to other categories in this run.
	    If not enough such IDs exist, the category keeps fewer than four
	    entries (``count`` reports the real number) rather than looping
	    forever or failing on an empty pool.

	    Returns:
	        A JSON string mapping each category name to
	        ``{"ids": [...], "count": <int>}``.

	    Raises:
	        Exception: if the data file cannot be downloaded or uploaded.
	    """
	    if not utils.download_datafile('arxiv.txt'):
	        raise Exception("Failed to download datafile")
	    categories = {
	        "Astrophysics": ["astro-ph"],
	        "Condensed Matter": ["cond-mat"],
	        "General Relativity and Quantum Cosmology": ["gr-qc"],
	        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
	        "Mathematical Physics": ["math-ph"],
	        "Nonlinear Sciences": ["nlin"],
	        "Nuclear Experiment": ["nucl-ex"],
	        "Nuclear Theory": ["nucl-th"],
	        "Physics": ["physics"],
	        "Quantum Physics": ["quant-ph"],
	        "Mathematics": ["math"],
	        "Computer Science": ["cs"],
	        "Quantitative Biology": ["q-bio"],
	        "Quantitative Finance": ["q-fin"],
	        "Statistics": ["stat"],
	        "Electrical Engineering and Systems Science": ["eess"],
	        "Economics": ["econ"]
	    }
	    ids_per_category = 4
	    data = {}
	    used_ids = set()
	    for category, subcategories in categories.items():
	        category_ids = set()
	        for subcategory in subcategories:
	            for paper_id in extract_data(subcategory):
	                if paper_id not in used_ids:
	                    category_ids.add(paper_id)
	                    used_ids.add(paper_id)
	                if len(category_ids) == ids_per_category:
	                    break
	            # Stop querying further subcategories once the quota is met.
	            if len(category_ids) == ids_per_category:
	                break
	        missing = ids_per_category - len(category_ids)
	        if missing > 0:
	            # Top up from IDs handed out earlier in this run. Sampling
	            # from the spare pool is bounded by its size, so this neither
	            # fails on an empty pool nor spins when fewer than
	            # ``ids_per_category`` distinct IDs exist.
	            spare = list(used_ids - category_ids)
	            category_ids.update(random.sample(spare, min(missing, len(spare))))
	        data[category] = {"ids": list(category_ids), "count": len(category_ids)}
	    if not utils.upload_datafile('arxiv.txt'):
	        raise Exception("Failed to upload datafile")
	    return json.dumps(data, indent=4, ensure_ascii=False)

	if __name__ == '__main__':
	    # Script entry point: build the report and save it next to the script.
	    data = extract_arxiv_data()
	    # The JSON text is generated with ensure_ascii=False, so write it as
	    # UTF-8 explicitly instead of relying on the platform default encoding.
	    with open('arxiv_data.json', 'w', encoding='utf-8') as f:
	        f.write(data)