Commit
·
cc25de1
1
Parent(s):
c680313
Refactor data extraction logic to improve category handling and streamline data sanitization
Browse files
- .gitignore +1 -4
- arvix.py +22 -10
.gitignore
CHANGED
@@ -1,8 +1,5 @@
|
|
1 |
/.cache
|
2 |
/__pycache__
|
3 |
.env
|
4 |
-
.env.local
|
5 |
-
.env.development.local
|
6 |
-
.env.test.local
|
7 |
-
.env.production.local
|
8 |
.DS_Store
|
|
|
|
1 |
/.cache
|
2 |
/__pycache__
|
3 |
.env
|
|
|
|
|
|
|
|
|
4 |
.DS_Store
|
5 |
+
*.json
|
arvix.py
CHANGED
@@ -51,18 +51,30 @@ def extract_data(category):
|
|
51 |
new_data = extract_new_data(category)
|
52 |
recent_data = extract_recent_data(category)
|
53 |
data = list(set(new_data + recent_data))
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
62 |
tools.write_data_to_file(id, 'arxiv.txt')
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
random.shuffle(sanitized_data)
|
65 |
-
|
|
|
66 |
|
67 |
def extract_arxiv_data():
|
68 |
if not tools.download_datafile('arxiv.txt'):
|
|
|
51 |
new_data = extract_new_data(category)
|
52 |
recent_data = extract_recent_data(category)
|
53 |
data = list(set(new_data + recent_data))
|
54 |
+
if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
|
55 |
+
category_list = []
|
56 |
+
for id in data:
|
57 |
+
if len(category_list) >= 3:
|
58 |
+
break
|
59 |
+
if tools.check_data_in_file(id, 'arxiv.txt'):
|
60 |
+
continue
|
61 |
+
else:
|
62 |
+
category_list.append(id)
|
63 |
+
for category_id in category_list:
|
64 |
+
sanitized_data.append(category_id)
|
65 |
tools.write_data_to_file(id, 'arxiv.txt')
|
66 |
+
else:
|
67 |
+
for id in data:
|
68 |
+
if len(sanitized_data) >= 12:
|
69 |
+
break
|
70 |
+
if tools.check_data_in_file(id, 'arxiv.txt'):
|
71 |
+
continue
|
72 |
+
else:
|
73 |
+
tools.write_data_to_file(id, 'arxiv.txt')
|
74 |
+
sanitized_data.append(id)
|
75 |
random.shuffle(sanitized_data)
|
76 |
+
print(len(sanitized_data))
|
77 |
+
return sanitized_data
|
78 |
|
79 |
def extract_arxiv_data():
|
80 |
if not tools.download_datafile('arxiv.txt'):
|