import json
import random
from bs4 import BeautifulSoup
import utils

def fetch_new_page(category):
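    """Fetch the arXiv 'new submissions' listing page for a category."""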
    url = f'https://arxiv.org/list/{category}/new'
    return utils.fetch_page(url)

def fetch_recent_page(category):
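    """Fetch the arXiv 'recent submissions' listing page for a category."""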
    url = f'https://arxiv.org/list/{category}/recent'
    return utils.fetch_page(url)

def extract_new_data(category):
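    """Collect paper ids from the 'new' listing, keeping only papers whose titles pass utils.verify_simple_title."""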
    paper_ids = []
    page_content = fetch_new_page(category)
    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in lists:  # 'listing' instead of 'list' to avoid shadowing the builtin
        papers = listing.find_all('dt')
        paper_contents = listing.find_all('dd')
        # Each <dd> carries the paper metadata; the title div text reads "Title: <title>".
        titles = [
            content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip()
            for content in paper_contents
        ]
        for paper, title in zip(papers, titles):
            if not utils.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                # Link text looks like "arXiv:2401.12345"; keep the id after the colon.
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_recent_data(category):
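    """Collect every paper id found on the 'recent' listing for a category."""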
    paper_ids = []
    page_content = fetch_recent_page(category)
    lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in lists:  # 'listing' instead of 'list' to avoid shadowing the builtin
        papers = listing.find_all('dt')
        for paper in papers:
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_data(category):
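    """Return up to two paper ids for a category that are not already recorded in arxiv.txt."""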
    all_ids = set()
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    combined_data = new_data + recent_data
    for paper_id in combined_data:
        if not utils.check_data_in_file(paper_id, 'arxiv.txt'):
            utils.write_data_to_file(paper_id, 'arxiv.txt')
            all_ids.add(paper_id)
        if len(all_ids) >= 2:
            break
    return list(all_ids)

def extract_arxiv_data():
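    """Gather two paper ids per top-level arXiv category and return them as a JSON string."""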
    if not utils.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")
    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }
    data = {}
    used_ids = set()
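    # Pick two unique ids per category, never reusing an id already assigned elsewhere.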
    for category, subcategories in categories.items():
        category_ids = set()
        for subcategory in subcategories:
            ids = extract_data(subcategory)
            for paper_id in ids:
                if paper_id not in used_ids:
                    category_ids.add(paper_id)
                    used_ids.add(paper_id)
                if len(category_ids) == 2:
                    break
            if len(category_ids) == 2:
                break
        # Pad with already-used ids (if any remain) so each category still reports two entries.
        while len(category_ids) < 2 and (used_ids - category_ids):
            category_ids.add(random.choice(list(used_ids - category_ids)))
        data[category] = {"ids": list(category_ids), "count": len(category_ids)}
    if not utils.upload_datafile('arxiv.txt'):
        raise Exception("Failed to upload datafile")
    return json.dumps(data, indent=4, ensure_ascii=False)

if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w', encoding='utf-8') as f:
        f.write(data)