raannakasturi committed on
Commit
cc25de1
·
1 Parent(s): c680313

Refactor data extraction logic to improve category handling and streamline data sanitization

Browse files
Files changed (2) hide show
  1. .gitignore +1 -4
  2. arvix.py +22 -10
.gitignore CHANGED
@@ -1,8 +1,5 @@
1
  /.cache
2
  /__pycache__
3
  .env
4
- .env.local
5
- .env.development.local
6
- .env.test.local
7
- .env.production.local
8
  .DS_Store
 
 
1
  /.cache
2
  /__pycache__
3
  .env
 
 
 
 
4
  .DS_Store
5
+ *.json
arvix.py CHANGED
@@ -51,18 +51,30 @@ def extract_data(category):
51
  new_data = extract_new_data(category)
52
  recent_data = extract_recent_data(category)
53
  data = list(set(new_data + recent_data))
54
- for id in data:
55
- if len(sanitized_data) >= 12:
56
- break
57
- if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
58
- id = id[:3]
59
- if tools.check_data_in_file(id, 'arxiv.txt'):
60
- continue
61
- else:
 
 
 
62
  tools.write_data_to_file(id, 'arxiv.txt')
63
- sanitized_data.append(id)
 
 
 
 
 
 
 
 
64
  random.shuffle(sanitized_data)
65
- return sanitized_data[:12]
 
66
 
67
  def extract_arxiv_data():
68
  if not tools.download_datafile('arxiv.txt'):
 
51
  new_data = extract_new_data(category)
52
  recent_data = extract_recent_data(category)
53
  data = list(set(new_data + recent_data))
54
+ if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
55
+ category_list = []
56
+ for id in data:
57
+ if len(category_list) >= 3:
58
+ break
59
+ if tools.check_data_in_file(id, 'arxiv.txt'):
60
+ continue
61
+ else:
62
+ category_list.append(id)
63
+ for category_id in category_list:
64
+ sanitized_data.append(category_id)
65
  tools.write_data_to_file(id, 'arxiv.txt')
66
+ else:
67
+ for id in data:
68
+ if len(sanitized_data) >= 12:
69
+ break
70
+ if tools.check_data_in_file(id, 'arxiv.txt'):
71
+ continue
72
+ else:
73
+ tools.write_data_to_file(id, 'arxiv.txt')
74
+ sanitized_data.append(id)
75
  random.shuffle(sanitized_data)
76
+ print(len(sanitized_data))
77
+ return sanitized_data
78
 
79
  def extract_arxiv_data():
80
  if not tools.download_datafile('arxiv.txt'):