raannakasturi committed
Commit 82ed732 · verified · 1 Parent(s): b37e434

Update arvix.py

Files changed (1): arvix.py +41 -63
arvix.py CHANGED
@@ -1,15 +1,19 @@
 import json
 import random
-import tools
+import os
+import sys
 from bs4 import BeautifulSoup
 
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+import utils
+
 def fetch_new_page(category):
     url = f'https://arxiv.org/list/{category}/new'
-    return tools.fetch_page(url)
+    return utils.fetch_page(url)
 
 def fetch_recent_page(category):
     url = f'https://arxiv.org/list/{category}/recent'
-    return tools.fetch_page(url)
+    return utils.fetch_page(url)
 
 def extract_new_data(category):
     paper_ids = []
@@ -20,15 +24,12 @@ def extract_new_data(category):
     paper_contents = list.find_all('dd')
     titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip() for paper_content in paper_contents]
     for paper, title in zip(papers, titles):
-        if not tools.verify_simple_title(title):
+        if not utils.verify_simple_title(title):
             continue
-        else:
-            paper_link = paper.find('a', href=True)
-            if paper_link:
-                paper_id = paper_link.text.strip().split(':')[1]
-                paper_ids.append(paper_id)
-            else:
-                continue
+        paper_link = paper.find('a', href=True)
+        if paper_link:
+            paper_id = paper_link.text.strip().split(':')[1]
+            paper_ids.append(paper_id)
     return paper_ids
 
 def extract_recent_data(category):
@@ -42,42 +43,22 @@ def extract_recent_data(category):
         if paper_link:
             paper_id = paper_link.text.strip().split(':')[1]
             paper_ids.append(paper_id)
-        else:
-            continue
     return paper_ids
 
 def extract_data(category):
-    sanitized_data = []
+    all_ids = set()
     new_data = extract_new_data(category)
     recent_data = extract_recent_data(category)
-    data = list(set(new_data + recent_data))
-    if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
-        category_list = []
-        for id in data:
-            if len(category_list) >= 1:
-                break
-            if tools.check_data_in_file(id, 'arxiv.txt'):
-                continue
-            else:
-                category_list.append(id)
-        for category_id in category_list:
-            sanitized_data.append(category_id)
-            tools.write_data_to_file(id, 'arxiv.txt')
-    else:
-        for id in data:
-            if len(sanitized_data) >= 3:
-                break
-            if tools.check_data_in_file(id, 'arxiv.txt'):
-                continue
-            else:
-                tools.write_data_to_file(id, 'arxiv.txt')
-                sanitized_data.append(id)
-    random.shuffle(sanitized_data)
-    return sanitized_data
+    combined_data = new_data + recent_data
+    for paper_id in combined_data:
+        if not utils.check_data_in_file(paper_id, 'arxiv.txt'):
+            utils.write_data_to_file(paper_id, 'arxiv.txt')
+            all_ids.add(paper_id)
+        if len(all_ids) >= 4:
+            break
+    return list(all_ids)
 
 def extract_arxiv_data():
-    if not tools.download_datafile('arxiv.txt'):
-        raise Exception("Failed to download datafile")
     categories = {
         "Astrophysics": ["astro-ph"],
         "Condensed Matter": ["cond-mat"],
@@ -98,33 +79,30 @@ def extract_arxiv_data():
         "Economics": ["econ"]
     }
     data = {}
+    used_ids = set()
+
     for category, subcategories in categories.items():
-        category_data = {}
-        all_ids = []
-        temp_id_storage = []
+        category_ids = set()
         for subcategory in subcategories:
             ids = extract_data(subcategory)
-            if len(ids) == 3:
-                for id in ids:
-                    temp_id_storage.append(id)
-            else:
-                for id in ids:
-                    all_ids.append(id)
-        for temp_id in temp_id_storage:
-            all_ids.append(temp_id)
-        random.shuffle(all_ids)
-        if len(all_ids) > 3:
-            print(f"Found more than 3 papers for {category}.")
-            all_ids = all_ids[:3]
-        category_data['count'] = len(all_ids)
-        category_data['ids'] = all_ids
-        data[category] = category_data
-    data = json.dumps(data, indent=4, ensure_ascii=False)
-    if not tools.upload_datafile('arxiv.txt'):
-        raise Exception("Failed to upload datafile")
-    return data
+            for paper_id in ids:
+                if paper_id not in used_ids:
+                    category_ids.add(paper_id)
+                    used_ids.add(paper_id)
+                if len(category_ids) == 4:
+                    break
+            if len(category_ids) == 4:
+                break
+
+        # Ensure exactly 4 IDs for each category
+        while len(category_ids) < 4:
+            category_ids.add(random.choice(list(used_ids)))
+
+        data[category] = {"ids": list(category_ids)}
+
+    return json.dumps(data, indent=4, ensure_ascii=False)
 
 if __name__ == '__main__':
     data = extract_arxiv_data()
     with open('arxiv_data.json', 'w') as f:
-        f.write(data)
+        f.write(data)
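The rewritten module leans on four helpers from the new sibling utils import (fetch_page, verify_simple_title, check_data_in_file, write_data_to_file), none of which appear in this diff. A minimal sketch of compatible implementations, for orientation only; every body below is an assumption about the repository's code, not the code itself:

import os
import requests

def fetch_page(url):
    # Assumption: returns the page HTML as text for BeautifulSoup to parse.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text

def verify_simple_title(title):
    # Assumption: rejects empty or markup-heavy titles that the
    # downstream pipeline cannot handle.
    return bool(title) and '$' not in title and '\\' not in title

def check_data_in_file(data, filename):
    # Assumption: membership test against a one-ID-per-line file.
    if not os.path.exists(filename):
        return False
    with open(filename) as f:
        return data in (line.strip() for line in f)

def write_data_to_file(data, filename):
    # Assumption: appends one ID per line.
    with open(filename, 'a') as f:
        f.write(data + '\n')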
 
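With this change, extract_arxiv_data() returns a JSON string (not a dict) mapping each category name to an "ids" list of exactly four paper IDs; the old "count" field is dropped. A minimal consumption sketch, assuming arvix.py is importable from the working directory:

import json
from arvix import extract_arxiv_data

data = json.loads(extract_arxiv_data())
for category, payload in data.items():
    # payload has the shape {"ids": [...]}; after the padding loop in
    # the diff above, each list holds exactly four arXiv IDs.
    print(category, payload["ids"])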