Terry Zhuo committed on
Commit
b406d3a
·
1 Parent(s): 3f2364a
Files changed (1) hide show
  1. azure_count_ip_data.py +47 -35
azure_count_ip_data.py CHANGED
@@ -10,22 +10,26 @@ from typing import Dict, Set, Tuple, Optional
10
  from log_reader import RemoteLogReader
11
 
12
  # List of IP addresses we care about
13
# Flat allow-list of annotator client IP addresses (a mix of IPv4 and IPv6
# literals). NOTE(review): this commit removes the flat list in favor of
# WHITELIST_IPS_DICT (annotator name -> list of IPs) and rebuilds a flat list
# from that mapping for backward compatibility.
WHITELIST_IPS = [
    "199.111.212.5",
    "175.159.122.63",
    "109.245.193.97",
    "158.195.18.232",
    "2607:fea8:4f40:4b00:e5b9:9806:6b69:233b",
    "66.254.231.49",
    "129.74.154.194",
    "175.196.44.217",
    "2601:600:8d00:9510:1d77:b610:9358:f443",
    "74.90.222.68",
    "2a02:169:3e9:0:6ce8:e76f:faed:c830",
    "70.50.179.57",
    "2a02:842a:24:5a01:8cd6:5b22:1189:6035",
    "2408:8418:6390:7603:40b:555f:774:a05d"
]
 
 
 
 
29
 
30
  logging.basicConfig(level=logging.WARNING)
31
  log = logging.getLogger(__name__)
@@ -108,16 +112,19 @@ def get_file_data(content: str) -> Tuple[Optional[str], bool]:
108
  return None, False
109
 
110
  def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
111
- """Count files per IP address from the given start date"""
112
  # Convert start date string to datetime
113
  start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
114
- ip_counts = defaultdict(int)
115
 
116
  try:
117
  # Get current date for iteration
118
  current_date = start_date
119
  today = datetime.now()
120
 
 
 
 
121
  while current_date <= today:
122
  date_str = current_date.strftime("%Y_%m_%d")
123
 
@@ -132,8 +139,9 @@ def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02
132
  # Convert messages to file content format
133
  content = '\n'.join(json.dumps(msg) for msg in messages)
134
  ip, vote_conditions_met = get_file_data(content)
135
- if vote_conditions_met and ip:
136
- ip_counts[ip] += 1
 
137
 
138
  except Exception as e:
139
  log.error(f"Error processing logs for date {date_str}: {e}")
@@ -144,10 +152,10 @@ def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02
144
  except Exception as e:
145
  log.error(f"Error accessing logs: {e}")
146
 
147
- return dict(ip_counts)
148
 
149
- def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18", check_sandbox: bool = True) -> None:
150
- """Download files and organize them by IP address
151
 
152
  Args:
153
  reader: RemoteLogReader instance
@@ -158,6 +166,9 @@ def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_
158
  data_dir = os.path.join(os.getcwd(), "data")
159
  os.makedirs(data_dir, exist_ok=True)
160
 
 
 
 
161
  # Convert start date string to datetime
162
  start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
163
 
@@ -186,11 +197,12 @@ def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_
186
  content = '\n'.join(json.dumps(msg) for msg in messages)
187
  ip = get_ip_from_jsonl(content)
188
 
189
- if ip:
190
- # Create directory structure for this IP
191
- ip_dir = os.path.join(data_dir, ip)
192
- valid_dir = os.path.join(ip_dir, "valid")
193
- invalid_dir = os.path.join(ip_dir, "invalid")
 
194
  os.makedirs(valid_dir, exist_ok=True)
195
  os.makedirs(invalid_dir, exist_ok=True)
196
 
@@ -226,21 +238,21 @@ def main():
226
  reader = RemoteLogReader()
227
 
228
  # Add argument parser for optional parameters
229
- parser = argparse.ArgumentParser(description='Download and organize conversation files by IP')
230
  parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
231
  parser.add_argument('--download', action='store_true', help='Enable file download')
232
  args = parser.parse_args()
233
 
234
  # Download files if enabled
235
  if args.download:
236
- print("\nDownloading files and organizing by IP address...")
237
- download_files_by_ip(reader, check_sandbox=args.sandbox_check)
238
 
239
  # Count and display statistics
240
- ip_counts = count_files_per_ip(reader)
241
- print("\nFile counts per IP address:")
242
- for ip, count in sorted(ip_counts.items(), key=lambda x: x[1], reverse=True):
243
- print(f"IP: {ip:<15} Count: {count}")
244
 
245
  if __name__ == "__main__":
246
  main()
 
10
  from log_reader import RemoteLogReader
11
 
12
  # List of IP addresses we care about
13
# Mapping of annotator name -> list of that annotator's known client IP
# addresses (IPv4 or IPv6 string literals). A reverse ip -> name mapping is
# derived from this dict elsewhere in the file to attribute log files to
# annotators.
WHITELIST_IPS_DICT = {
    "Chen Gong": ["199.111.212.5"],
    "Juyong Jiang": ["175.159.122.63"],
    "Kenneth Hamilton": ["109.245.193.97"],
    "Marek Suppa": ["158.195.18.232"],
    "Max Tian": ["2607:fea8:4f40:4b00:e5b9:9806:6b69:233b"],
    "Mengzhao Jia": ["66.254.231.49"],
    "Noah Ziems": ["129.74.154.194"],
    "Sabina A": ["175.196.44.217"],
    "Wenhao Yu": ["2601:600:8d00:9510:1d77:b610:9358:f443"],
    "Vaisakhi Mishra": ["74.90.222.68"],
    "Kumar Shridhar": ["129.132.145.250"],
    "Viktor Gal": ["2a02:169:3e9:0:6ce8:e76f:faed:c830"],
    "Guangyu Song": ["70.50.179.57"],
    "Bhupesh Bishnoi": ["2a02:842a:24:5a01:8cd6:5b22:1189:6035"],
    "Zheng Liu": ["2408:8418:6390:7603:40b:555f:774:a05d"]
}

# Flatten IP list for backward compatibility: any code that still expects a
# plain list of whitelisted IP strings (the pre-commit WHITELIST_IPS shape)
# keeps working against this derived view.
WHITELIST_IPS = [ip for ips in WHITELIST_IPS_DICT.values() for ip in ips]
33
 
34
  logging.basicConfig(level=logging.WARNING)
35
  log = logging.getLogger(__name__)
 
112
  return None, False
113
 
114
  def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
115
+ """Count files per name from the given start date"""
116
  # Convert start date string to datetime
117
  start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
118
+ name_counts = defaultdict(int)
119
 
120
  try:
121
  # Get current date for iteration
122
  current_date = start_date
123
  today = datetime.now()
124
 
125
+ # Create reverse mapping of IP to name
126
+ ip_to_name = {ip: name for name, ips in WHITELIST_IPS_DICT.items() for ip in ips}
127
+
128
  while current_date <= today:
129
  date_str = current_date.strftime("%Y_%m_%d")
130
 
 
139
  # Convert messages to file content format
140
  content = '\n'.join(json.dumps(msg) for msg in messages)
141
  ip, vote_conditions_met = get_file_data(content)
142
+ if vote_conditions_met and ip and ip in ip_to_name:
143
+ name = ip_to_name[ip]
144
+ name_counts[name] += 1
145
 
146
  except Exception as e:
147
  log.error(f"Error processing logs for date {date_str}: {e}")
 
152
  except Exception as e:
153
  log.error(f"Error accessing logs: {e}")
154
 
155
+ return dict(name_counts)
156
 
157
+ def download_files_by_name(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18", check_sandbox: bool = True) -> None:
158
+ """Download files and organize them by annotator name
159
 
160
  Args:
161
  reader: RemoteLogReader instance
 
166
  data_dir = os.path.join(os.getcwd(), "data")
167
  os.makedirs(data_dir, exist_ok=True)
168
 
169
+ # Create reverse mapping of IP to name
170
+ ip_to_name = {ip: name for name, ips in WHITELIST_IPS_DICT.items() for ip in ips}
171
+
172
  # Convert start date string to datetime
173
  start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
174
 
 
197
  content = '\n'.join(json.dumps(msg) for msg in messages)
198
  ip = get_ip_from_jsonl(content)
199
 
200
+ if ip and ip in ip_to_name:
201
+ name = ip_to_name[ip]
202
+ # Create directory structure for this name
203
+ name_dir = os.path.join(data_dir, name)
204
+ valid_dir = os.path.join(name_dir, "valid")
205
+ invalid_dir = os.path.join(name_dir, "invalid")
206
  os.makedirs(valid_dir, exist_ok=True)
207
  os.makedirs(invalid_dir, exist_ok=True)
208
 
 
238
  reader = RemoteLogReader()
239
 
240
  # Add argument parser for optional parameters
241
+ parser = argparse.ArgumentParser(description='Download and organize conversation files by annotator name')
242
  parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
243
  parser.add_argument('--download', action='store_true', help='Enable file download')
244
  args = parser.parse_args()
245
 
246
  # Download files if enabled
247
  if args.download:
248
+ print("\nDownloading files and organizing by annotator name...")
249
+ download_files_by_name(reader, check_sandbox=args.sandbox_check)
250
 
251
  # Count and display statistics
252
+ name_counts = count_files_per_ip(reader)
253
+ print("\nFile counts per annotator:")
254
+ for name, count in sorted(name_counts.items(), key=lambda x: x[1], reverse=True):
255
+ print(f"Name: {name:<20} Count: {count}")
256
 
257
  if __name__ == "__main__":
258
  main()