HoneyTian committed on
Commit
92a4906
·
1 Parent(s): 1e55fa2
data/eval_data/google_anthropic/anthropic/claude-opus-4@20250514/shenzhen_sase/google_nxcloud_312303/20250801_172915/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3724e70b6b1760410a59daa8c3f1b3071aab647f060809bc74340417472ac529
3
+ size 266306
data/eval_data/google_anthropic/anthropic/claude-opus-4@20250514/shenzhen_sase/google_nxcloud_312303/20250801_172915/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28fa6632e2db64e19c4ce01100c691d4fcfd72614caeec1a6731bd3b519983b0
3
+ size 1466404
data/eval_data/google_anthropic/anthropic/claude-sonnet-4@20250514/shenzhen_sase/google_nxcloud_312303/20250801_164202/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b5053f79a7991bbeda307d9759e68e0069c9b595059588bdc343c3d44ef30c8
3
+ size 258473
data/eval_data/google_anthropic/anthropic/claude-sonnet-4@20250514/shenzhen_sase/google_nxcloud_312303/20250801_164202/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ecbb9c1e967d0e748f8e3cc5e46fa970bb28213439584eb4723123490c62553
3
+ size 1211251
llm_eval_script/gemini_google.py CHANGED
@@ -47,9 +47,9 @@ def get_args():
47
  # default="gemini-2.5-flash",
48
  # default="gemini-2.5-flash-lite-preview-06-17",
49
  # default="claude-opus-4@20250514",
50
- # default="claude-sonnet-4@20250514",
51
  # default="llama-4-maverick-17b-128e-instruct-maas",
52
- default="llama-4-scout-17b-16e-instruct-maas",
53
  type=str
54
  )
55
  parser.add_argument(
@@ -76,13 +76,14 @@ def get_args():
76
  )
77
  parser.add_argument(
78
  "--service",
79
- default="google_potent_veld_462405_t3",
 
80
  type=str
81
  )
82
  parser.add_argument(
83
  "--create_time_str",
84
- # default="null",
85
- default="20250731_162116",
86
  type=str
87
  )
88
  parser.add_argument(
 
47
  # default="gemini-2.5-flash",
48
  # default="gemini-2.5-flash-lite-preview-06-17",
49
  # default="claude-opus-4@20250514",
50
+ default="claude-sonnet-4@20250514",
51
  # default="llama-4-maverick-17b-128e-instruct-maas",
52
+ # default="llama-4-scout-17b-16e-instruct-maas",
53
  type=str
54
  )
55
  parser.add_argument(
 
76
  )
77
  parser.add_argument(
78
  "--service",
79
+ # default="google_potent_veld_462405_t3",
80
+ default="google_nxcloud_312303",
81
  type=str
82
  )
83
  parser.add_argument(
84
  "--create_time_str",
85
+ default="null",
86
+ # default="20250731_162116",
87
  type=str
88
  )
89
  parser.add_argument(
llm_eval_script/google_anthropic.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude/use-claude?hl=zh-cn
5
+ """
6
+ import argparse
7
+ from datetime import datetime
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
+ import sys
12
+ import time
13
+ import tempfile
14
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
15
+
16
+ pwd = os.path.abspath(os.path.dirname(__file__))
17
+ sys.path.append(os.path.join(pwd, "../"))
18
+
19
+ from google import genai
20
+ from google.genai import types
21
+ from anthropic import AnthropicVertex
22
+
23
+ from project_settings import environment, project_path
24
+
25
+
26
def get_args():
    """
    Build the command-line parser for the evaluation script and parse sys.argv.

    Every option is a plain ``--flag value`` pair with a default, so the script
    can be run without any arguments. Returns the populated argparse namespace.
    """
    # (flag, type, default) in the order the options are registered.
    option_specs = [
        # Alternative model: "claude-sonnet-4@20250514"
        ("--model_name", str, "claude-opus-4@20250514"),
        # Alternative datasets: "agent-bingoplus-ph-90-choice.jsonl",
        # "arc-easy-1000-choice.jsonl"
        ("--eval_dataset_name", str, "agent-lingoace-zh-400-choice.jsonl"),
        ("--eval_dataset_dir", str, (project_path / "data/dataset").as_posix()),
        ("--eval_data_dir", str, (project_path / "data/eval_data").as_posix()),
        ("--client", str, "shenzhen_sase"),
        # Alternative service: "google_potent_veld_462405_t3"
        ("--service", str, "google_nxcloud_312303"),
        # "null" means "generate a fresh timestamp"; pass an existing value
        # such as "20250731_162116" to resume that run.
        ("--create_time_str", str, "null"),
        # Seconds to sleep before each request.
        ("--interval", int, 1),
    ]

    arg_parser = argparse.ArgumentParser()
    for flag, value_type, default_value in option_specs:
        arg_parser.add_argument(flag, default=default_value, type=value_type)

    return arg_parser.parse_args()
75
+
76
+
77
def _write_credentials_file(service):
    """
    Dump the service-account dict to a JSON file under the temp dir and return its path.

    The file holds a private key, so it is created with owner-only permissions
    (0o600) instead of the process default, and an already existing file is
    tightened to the same mode.
    """
    project_id = service["project_id"]
    credentials_path = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    credentials_path.parent.mkdir(parents=True, exist_ok=True)

    fd = os.open(
        credentials_path.as_posix(),
        os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        0o600,
    )
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")
    # The mode passed to os.open only applies when the file is newly created.
    os.chmod(credentials_path.as_posix(), 0o600)
    return credentials_path


def _load_progress(output_file):
    """
    Read a previous output file (if any) so an interrupted run can be resumed.

    Returns ``(finished_idx_set, total, total_correct)`` where the two counters
    are taken from the last row written. Blank lines are ignored.
    """
    finished_idx_set = set()
    total = 0
    total_correct = 0

    if output_file.exists():
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                row = json.loads(line)
                finished_idx_set.add(row["idx"])
                # Rows carry running totals, so the last row wins.
                total = row["total"]
                total_correct = row["total_correct"]

    return finished_idx_set, total, total_correct


def main():
    """
    Evaluate a Claude model served through Vertex AI on a multiple-choice dataset.

    Each dataset row (JSON with "idx", "prompt", "response") is sent as a single
    user message; the reply text is compared to "response" by exact string
    equality and one JSON line with the running score is appended to the output
    file. Rows whose "idx" is already present in the output file are skipped, so
    re-running with the same --create_time_str resumes a previous run.

    Failed requests and responses without a leading text block are logged and
    skipped; they are not written to the output file and therefore are retried
    on the next resume.
    """
    args = get_args()

    # The entry named by --service is decoded with json.loads and must contain
    # "project_id"; presumably it is a full service-account key (TODO confirm).
    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    google_application_credentials = _write_credentials_file(service)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        # No run id given: start a new run stamped with the current Shanghai time.
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"google_anthropic/anthropic/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # The region is fixed here; it is not exposed as a command-line option.
    client = AnthropicVertex(project_id=project_id, region="us-east5")

    finished_idx_set, total, total_correct = _load_progress(output_file)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, \
            open(output_file.as_posix(), "a", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                continue
            row = json.loads(line)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            # Marked before the request so a duplicated idx in the dataset is
            # attempted at most once per run.
            finished_idx_set.add(idx)

            try:
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                message = client.messages.create(
                    model=args.model_name,
                    max_tokens=1024,
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        }
                    ],
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Best-effort: one failed request must not abort the whole run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            # Guard against an empty content list or a non-text first block,
            # which would otherwise raise and kill the run.
            blocks = message.content
            if not blocks or not hasattr(blocks[0], "text"):
                print(f"unexpected response content, idx: {idx}")
                continue
            prediction = blocks[0].text

            # Exact-match scoring, intentionally without normalization.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so progress survives an interruption.
            fout.flush()
182
+
183
+
184
+ if __name__ == "__main__":
185
+ main()
requirements.txt CHANGED
@@ -8,3 +8,4 @@ openai
8
  smithy-aws-core>=0.0.1
9
  aws_sdk_bedrock_runtime
10
  boto3
 
 
8
  smithy-aws-core>=0.0.1
9
  aws_sdk_bedrock_runtime
10
  boto3
11
+ anthropic