Terry Zhuo
committed on
Commit
·
b406d3a
1
Parent(s):
3f2364a
update
Browse files
- azure_count_ip_data.py +47 -35
azure_count_ip_data.py
CHANGED
@@ -10,22 +10,26 @@ from typing import Dict, Set, Tuple, Optional
|
|
10 |
from log_reader import RemoteLogReader
|
11 |
|
12 |
# List of IP addresses we care about
|
13 |
-
|
14 |
-
"199.111.212.5",
|
15 |
-
"175.159.122.63",
|
16 |
-
"109.245.193.97",
|
17 |
-
"158.195.18.232",
|
18 |
-
"2607:fea8:4f40:4b00:e5b9:9806:6b69:233b",
|
19 |
-
"66.254.231.49",
|
20 |
-
"129.74.154.194",
|
21 |
-
"175.196.44.217",
|
22 |
-
"2601:600:8d00:9510:1d77:b610:9358:f443",
|
23 |
-
"74.90.222.68",
|
24 |
-
"
|
25 |
-
"
|
26 |
-
"
|
27 |
-
"
|
28 |
-
]
|
|
|
|
|
|
|
|
|
29 |
|
30 |
logging.basicConfig(level=logging.WARNING)
|
31 |
log = logging.getLogger(__name__)
|
@@ -108,16 +112,19 @@ def get_file_data(content: str) -> Tuple[Optional[str], bool]:
|
|
108 |
return None, False
|
109 |
|
110 |
def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
|
111 |
-
"""Count files per
|
112 |
# Convert start date string to datetime
|
113 |
start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
|
114 |
-
|
115 |
|
116 |
try:
|
117 |
# Get current date for iteration
|
118 |
current_date = start_date
|
119 |
today = datetime.now()
|
120 |
|
|
|
|
|
|
|
121 |
while current_date <= today:
|
122 |
date_str = current_date.strftime("%Y_%m_%d")
|
123 |
|
@@ -132,8 +139,9 @@ def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02
|
|
132 |
# Convert messages to file content format
|
133 |
content = '\n'.join(json.dumps(msg) for msg in messages)
|
134 |
ip, vote_conditions_met = get_file_data(content)
|
135 |
-
if vote_conditions_met and ip:
|
136 |
-
|
|
|
137 |
|
138 |
except Exception as e:
|
139 |
log.error(f"Error processing logs for date {date_str}: {e}")
|
@@ -144,10 +152,10 @@ def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02
|
|
144 |
except Exception as e:
|
145 |
log.error(f"Error accessing logs: {e}")
|
146 |
|
147 |
-
return dict(
|
148 |
|
149 |
-
def
|
150 |
-
"""Download files and organize them by
|
151 |
|
152 |
Args:
|
153 |
reader: RemoteLogReader instance
|
@@ -158,6 +166,9 @@ def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_
|
|
158 |
data_dir = os.path.join(os.getcwd(), "data")
|
159 |
os.makedirs(data_dir, exist_ok=True)
|
160 |
|
|
|
|
|
|
|
161 |
# Convert start date string to datetime
|
162 |
start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
|
163 |
|
@@ -186,11 +197,12 @@ def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_
|
|
186 |
content = '\n'.join(json.dumps(msg) for msg in messages)
|
187 |
ip = get_ip_from_jsonl(content)
|
188 |
|
189 |
-
if ip:
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
|
|
194 |
os.makedirs(valid_dir, exist_ok=True)
|
195 |
os.makedirs(invalid_dir, exist_ok=True)
|
196 |
|
@@ -226,21 +238,21 @@ def main():
|
|
226 |
reader = RemoteLogReader()
|
227 |
|
228 |
# Add argument parser for optional parameters
|
229 |
-
parser = argparse.ArgumentParser(description='Download and organize conversation files by
|
230 |
parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
|
231 |
parser.add_argument('--download', action='store_true', help='Enable file download')
|
232 |
args = parser.parse_args()
|
233 |
|
234 |
# Download files if enabled
|
235 |
if args.download:
|
236 |
-
print("\nDownloading files and organizing by
|
237 |
-
|
238 |
|
239 |
# Count and display statistics
|
240 |
-
|
241 |
-
print("\nFile counts per
|
242 |
-
for
|
243 |
-
print(f"
|
244 |
|
245 |
if __name__ == "__main__":
|
246 |
main()
|
|
|
10 |
from log_reader import RemoteLogReader
|
11 |
|
12 |
# List of IP addresses we care about
|
13 |
+
WHITELIST_IPS_DICT = {
|
14 |
+
"Chen Gong": ["199.111.212.5"],
|
15 |
+
"Juyong Jiang": ["175.159.122.63"],
|
16 |
+
"Kenneth Hamilton": ["109.245.193.97"],
|
17 |
+
"Marek Suppa": ["158.195.18.232"],
|
18 |
+
"Max Tian": ["2607:fea8:4f40:4b00:e5b9:9806:6b69:233b"],
|
19 |
+
"Mengzhao Jia": ["66.254.231.49"],
|
20 |
+
"Noah Ziems": ["129.74.154.194"],
|
21 |
+
"Sabina A": ["175.196.44.217"],
|
22 |
+
"Wenhao Yu": ["2601:600:8d00:9510:1d77:b610:9358:f443"],
|
23 |
+
"Vaisakhi Mishra": ["74.90.222.68"],
|
24 |
+
"Kumar Shridhar": ["129.132.145.250"],
|
25 |
+
"Viktor Gal": ["2a02:169:3e9:0:6ce8:e76f:faed:c830"],
|
26 |
+
"Guangyu Song": ["70.50.179.57"],
|
27 |
+
"Bhupesh Bishnoi": ["2a02:842a:24:5a01:8cd6:5b22:1189:6035"],
|
28 |
+
"Zheng Liu": ["2408:8418:6390:7603:40b:555f:774:a05d"]
|
29 |
+
}
|
30 |
+
|
31 |
+
# Flatten IP list for backward compatibility
|
32 |
+
WHITELIST_IPS = [ip for ips in WHITELIST_IPS_DICT.values() for ip in ips]
|
33 |
|
34 |
logging.basicConfig(level=logging.WARNING)
|
35 |
log = logging.getLogger(__name__)
|
|
|
112 |
return None, False
|
113 |
|
114 |
def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
|
115 |
+
"""Count files per name from the given start date"""
|
116 |
# Convert start date string to datetime
|
117 |
start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
|
118 |
+
name_counts = defaultdict(int)
|
119 |
|
120 |
try:
|
121 |
# Get current date for iteration
|
122 |
current_date = start_date
|
123 |
today = datetime.now()
|
124 |
|
125 |
+
# Create reverse mapping of IP to name
|
126 |
+
ip_to_name = {ip: name for name, ips in WHITELIST_IPS_DICT.items() for ip in ips}
|
127 |
+
|
128 |
while current_date <= today:
|
129 |
date_str = current_date.strftime("%Y_%m_%d")
|
130 |
|
|
|
139 |
# Convert messages to file content format
|
140 |
content = '\n'.join(json.dumps(msg) for msg in messages)
|
141 |
ip, vote_conditions_met = get_file_data(content)
|
142 |
+
if vote_conditions_met and ip and ip in ip_to_name:
|
143 |
+
name = ip_to_name[ip]
|
144 |
+
name_counts[name] += 1
|
145 |
|
146 |
except Exception as e:
|
147 |
log.error(f"Error processing logs for date {date_str}: {e}")
|
|
|
152 |
except Exception as e:
|
153 |
log.error(f"Error accessing logs: {e}")
|
154 |
|
155 |
+
return dict(name_counts)
|
156 |
|
157 |
+
def download_files_by_name(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18", check_sandbox: bool = True) -> None:
|
158 |
+
"""Download files and organize them by annotator name
|
159 |
|
160 |
Args:
|
161 |
reader: RemoteLogReader instance
|
|
|
166 |
data_dir = os.path.join(os.getcwd(), "data")
|
167 |
os.makedirs(data_dir, exist_ok=True)
|
168 |
|
169 |
+
# Create reverse mapping of IP to name
|
170 |
+
ip_to_name = {ip: name for name, ips in WHITELIST_IPS_DICT.items() for ip in ips}
|
171 |
+
|
172 |
# Convert start date string to datetime
|
173 |
start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
|
174 |
|
|
|
197 |
content = '\n'.join(json.dumps(msg) for msg in messages)
|
198 |
ip = get_ip_from_jsonl(content)
|
199 |
|
200 |
+
if ip and ip in ip_to_name:
|
201 |
+
name = ip_to_name[ip]
|
202 |
+
# Create directory structure for this name
|
203 |
+
name_dir = os.path.join(data_dir, name)
|
204 |
+
valid_dir = os.path.join(name_dir, "valid")
|
205 |
+
invalid_dir = os.path.join(name_dir, "invalid")
|
206 |
os.makedirs(valid_dir, exist_ok=True)
|
207 |
os.makedirs(invalid_dir, exist_ok=True)
|
208 |
|
|
|
238 |
reader = RemoteLogReader()
|
239 |
|
240 |
# Add argument parser for optional parameters
|
241 |
+
parser = argparse.ArgumentParser(description='Download and organize conversation files by annotator name')
|
242 |
parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
|
243 |
parser.add_argument('--download', action='store_true', help='Enable file download')
|
244 |
args = parser.parse_args()
|
245 |
|
246 |
# Download files if enabled
|
247 |
if args.download:
|
248 |
+
print("\nDownloading files and organizing by annotator name...")
|
249 |
+
download_files_by_name(reader, check_sandbox=args.sandbox_check)
|
250 |
|
251 |
# Count and display statistics
|
252 |
+
name_counts = count_files_per_ip(reader)
|
253 |
+
print("\nFile counts per annotator:")
|
254 |
+
for name, count in sorted(name_counts.items(), key=lambda x: x[1], reverse=True):
|
255 |
+
print(f"Name: {name:<20} Count: {count}")
|
256 |
|
257 |
if __name__ == "__main__":
|
258 |
main()
|