Commit 101f384 by dh-mc
Parent(s): 243d523

few shot COMET results
llm_toolkit/translation_utils.py CHANGED
@@ -249,12 +249,7 @@ def count_chinese_characters(text):
     return len(chinese_chars)
 
 
-def count_chinese_characters(text):
-    chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]")
-    return 1 if chinese_char_pattern.search(text) else 0
-
-
-def get_metrics(df, max_output_tokens=2048, variant="rpp"):
+def get_metrics(df, max_output_tokens=2048, variant="rpp", existing_metrics_df=None):
     metrics_df = pd.DataFrame(df.columns.T)[2:]
     metrics_df.rename(columns={0: "model"}, inplace=True)
     metrics_df[variant] = metrics_df["model"].apply(
@@ -272,6 +267,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
 
     tokenizers = {model: load_tokenizer(model) for model in models}
 
+    comet = []
     meteor = []
     spbleu = []
     bleu_1 = []
@@ -295,11 +291,22 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
     df[new_col] = df["chinese"].apply(count_chinese_characters)
 
     for col in columns:
-        metrics = calc_metrics(
-            df["english"], df[col], sources=df["chinese"], debug=True
-        )
+        if existing_metrics_df is not None:
+            print(f"Using existing metrics for {col}")
+            parts = col.split(f"/{variant}-")
+            result = existing_metrics_df[
+                (existing_metrics_df["model"] == parts[0])
+                & (existing_metrics_df[variant] == int(parts[1]))
+            ]
+            metrics = result.to_dict("records")[0]
+        else:
+            print(f"Calculating metrics for {col}")
+            metrics = calc_metrics(
+                df["english"], df[col], sources=df["chinese"], debug=True
+            )
         print(f"{col}: {metrics}")
 
+        comet.append(metrics["comet"])
         meteor.append(metrics["meteor"])
         spbleu.append(metrics["sacrebleu"]["score"])
         bleu_1.append(metrics["bleu_scores"]["bleu"])
@@ -332,6 +339,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
             count_entries_with_max_tokens(df[new_col], max_output_tokens)
         )
 
+    metrics_df["comet"] = comet
     metrics_df["meteor"] = meteor
     metrics_df["spbleu"] = spbleu
     metrics_df["bleu_1"] = bleu_1
@@ -340,7 +348,7 @@ def get_metrics(df, max_output_tokens=2048, variant="rpp"):
     metrics_df["repetition_score"] = repetition_score
     metrics_df["total_repetitions"] = total_repetitions
     metrics_df["rap"] = metrics_df.apply(
-        lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
+        lambda x: x["comet"] / math.log10(10 + x["total_repetitions"]), axis=1
    )
 
     metrics_df["translation_completeness"] = translation_completeness
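
The updated `calc_metrics` itself is not shown in this diff, so the snippet below is only a hedged sketch of how the new `comet` value and the re-based `rap` score could be produced, assuming the Hugging Face `evaluate` implementation of COMET (which is source-aware, matching the `sources=df["chinese"]` argument above); the sample sentences and the repetition value are illustrative only.

```python
# Hedged sketch, not the repository's code: COMET via the `evaluate` package
# takes the Chinese source sentences alongside predictions and references,
# which is consistent with calc_metrics(..., sources=df["chinese"]) above.
import math

import evaluate

comet_metric = evaluate.load("comet")  # downloads a COMET checkpoint on first use

result = comet_metric.compute(
    sources=["我爱你。"],  # illustrative example data
    predictions=["I love you."],
    references=["I love you."],
)
comet_score = result["mean_score"]  # per-segment scores are in result["scores"]

# The commit also re-bases the repetition-adjusted score (the "rap" column) on COMET:
total_repetitions = 0.3  # illustrative value; the real one is a per-model mean
rap = comet_score / math.log10(10 + total_repetitions)
```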
llm_toolkit/translation_utils_v1.py DELETED
@@ -1,421 +0,0 @@
-import os
-import re
-import pandas as pd
-import evaluate
-import seaborn as sns
-import matplotlib.pyplot as plt
-from datasets import load_dataset
-from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
-from tqdm import tqdm
-from eval_modules.calc_repetitions import *
-from llm_toolkit.llm_utils import load_tokenizer
-
-print(f"loading {__file__}")
-
-bleu = evaluate.load("bleu")
-rouge = evaluate.load("rouge")
-meteor = evaluate.load("meteor")
-accuracy = evaluate.load("accuracy")
-
-
-def extract_answer(text, debug=False):
-    if text:
-        # Remove the begin and end tokens
-        text = re.sub(
-            r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
-        )
-        if debug:
-            print("--------\nstep 1:", text)
-
-        text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
-        if debug:
-            print("--------\nstep 2:", text)
-
-        text = re.sub(
-            r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
-        )
-        if debug:
-            print("--------\nstep 3:", text)
-
-    return text
-
-
-def calc_metrics(references, predictions, debug=False):
-    assert len(references) == len(
-        predictions
-    ), f"lengths are difference: {len(references)} != {len(predictions)}"
-
-    predictions = [extract_answer(text) for text in predictions]
-    results = {}
-
-    results["meteor"] = meteor.compute(predictions=predictions, references=references)[
-        "meteor"
-    ]
-
-    results["bleu_scores"] = bleu.compute(
-        predictions=predictions, references=references, max_order=4
-    )
-    results["rouge_scores"] = rouge.compute(
-        predictions=predictions, references=references
-    )
-
-    correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
-    accuracy = sum(correct) / len(references)
-
-    results["accuracy"] = accuracy
-    if debug:
-        correct_ids = [i for i, c in enumerate(correct) if c == 1]
-        results["correct_ids"] = correct_ids
-
-    return results
-
-
-def save_results(model_name, results_path, dataset, predictions, debug=False):
-    if not os.path.exists(results_path):
-        # Get the directory part of the file path
-        dir_path = os.path.dirname(results_path)
-
-        # Create all directories in the path (if they don't exist)
-        os.makedirs(dir_path, exist_ok=True)
-        df = dataset.to_pandas()
-        df.drop(columns=["text", "prompt"], inplace=True)
-    else:
-        df = pd.read_csv(results_path, on_bad_lines="warn")
-
-    df[model_name] = predictions
-
-    if debug:
-        print(df.head(1))
-
-    df.to_csv(results_path, index=False)
-
-
-def load_translation_dataset(data_path, tokenizer=None):
-    train_data_file = data_path.replace(".tsv", "-train.tsv")
-    test_data_file = data_path.replace(".tsv", "-test.tsv")
-
-    if not os.path.exists(train_data_file):
-        print("generating train/test data files")
-        dataset = load_dataset(
-            "csv", data_files=data_path, delimiter="\t", split="train"
-        )
-        print(len(dataset))
-        dataset = dataset.filter(lambda x: x["chinese"] and x["english"])
-
-        datasets = dataset.train_test_split(test_size=0.2)
-        print(len(dataset))
-
-        # Convert to pandas DataFrame
-        train_df = pd.DataFrame(datasets["train"])
-        test_df = pd.DataFrame(datasets["test"])
-
-        # Save to TSV
-        train_df.to_csv(train_data_file, sep="\t", index=False)
-        test_df.to_csv(test_data_file, sep="\t", index=False)
-
-    print("loading train/test data files")
-    datasets = load_dataset(
-        "csv",
-        data_files={"train": train_data_file, "test": test_data_file},
-        delimiter="\t",
-    )
-
-    if tokenizer:
-        translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
-
-        def formatting_prompts_func(examples):
-            inputs = examples["chinese"]
-            outputs = examples["english"]
-
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are an expert in translating Chinese to English.",
-                },
-                None,
-            ]
-
-            model_name = os.getenv("MODEL_NAME")
-
-            # if "mistral" in model_name.lower():
-            #     messages = messages[1:]
-
-            texts = []
-            prompts = []
-            for input, output in zip(inputs, outputs):
-                prompt = translation_prompt.format(input)
-                messages[-1] = {"role": "user", "content": prompt}
-
-                prompt = tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True
-                )
-                prompts.append(prompt)
-                texts.append(prompt + output + tokenizer.eos_token)
-            return {"text": texts, "prompt": prompts}
-
-        datasets = datasets.map(
-            formatting_prompts_func,
-            batched=True,
-        )
-
-    print(datasets)
-    return datasets
-
-
-def count_entries_with_max_tokens(entries, max_tokens):
-    """
-    Count the number of entries with the max output tokens or more.
-
-    Parameters:
-    entries (list of int): List of token counts for each entry.
-    max_tokens (int): The maximum token threshold.
-
-    Returns:
-    int: The number of entries with token counts greater than or equal to max_tokens.
-    """
-    count = 0
-    for tokens in entries:
-        if tokens >= max_tokens:
-            count += 1
-    return count
-
-
-def detect_repetition_scores(row, col, debug=False):
-    # print(f"row: {row}")
-    newline_score, repetition_score, total_repetitions = detect_repetitions(
-        row[col], debug=debug
-    )
-    newline_score -= row["ground_truth_ews_score"]
-    repetition_score -= row["ground_truth_repetition_score"]
-    total_repetitions -= row["ground_truth_total_repetitions"]
-
-    return pd.Series(
-        [
-            newline_score if newline_score > 0 else 0,
-            repetition_score if repetition_score > 0 else 0,
-            total_repetitions if total_repetitions > 0 else 0,
-        ]
-    )
-
-
-def get_metrics(df, max_output_tokens=2048):
-    metrics_df = pd.DataFrame(df.columns.T)[2:]
-    metrics_df.rename(columns={0: "model"}, inplace=True)
-    metrics_df["rpp"] = metrics_df["model"].apply(lambda x: x.split("rpp-")[-1])
-    metrics_df["model"] = metrics_df["model"].apply(lambda x: x.split("/rpp-")[0])
-    metrics_df.reset_index(inplace=True)
-    metrics_df = metrics_df.drop(columns=["index"])
-
-    tokenizers = {
-        model: load_tokenizer(model) for model in metrics_df["model"].unique()
-    }
-
-    meteor = []
-    bleu_1 = []
-    rouge_l = []
-    ews_score = []
-    repetition_score = []
-    total_repetitions = []
-    num_max_output_tokens = []
-    columns = df.columns[2:]
-
-    df[
-        [
-            "ground_truth_ews_score",
-            "ground_truth_repetition_score",
-            "ground_truth_total_repetitions",
-        ]
-    ] = df["english"].apply(detect_scores)
-
-    for col in columns:
-        metrics = calc_metrics(df["english"], df[col], debug=True)
-        print(f"{col}: {metrics}")
-
-        meteor.append(metrics["meteor"])
-        bleu_1.append(metrics["bleu_scores"]["bleu"])
-        rouge_l.append(metrics["rouge_scores"]["rougeL"])
-
-        df[["ews_score", "repetition_score", "total_repetitions"]] = df.apply(
-            lambda x: detect_repetition_scores(x, col), axis=1
-        )
-        ews_score.append(df["ews_score"].mean())
-        repetition_score.append(df["repetition_score"].mean())
-        total_repetitions.append(df["total_repetitions"].mean())
-
-        model = col.split("/rpp")[0]
-
-        new_col = f"ground_truth_tokens-{model}"
-        df[new_col] = df["english"].apply(
-            lambda x: len(tokenizers[model](x)["input_ids"])
-        )
-
-        new_col = f"output_tokens-{col}"
-        df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
-
-        num_max_output_tokens.append(
-            count_entries_with_max_tokens(df[new_col], max_output_tokens)
-        )
-
-    metrics_df["meteor"] = meteor
-    metrics_df["bleu_1"] = bleu_1
-    metrics_df["rouge_l"] = rouge_l
-    metrics_df["ews_score"] = ews_score
-    metrics_df["repetition_score"] = repetition_score
-    metrics_df["total_repetitions"] = total_repetitions
-    metrics_df["rap"] = metrics_df.apply(
-        lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
-    )
-
-    metrics_df["num_max_output_tokens"] = num_max_output_tokens
-
-    return metrics_df
-
-
-def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
-    plt.figure(figsize=figsize)
-    df_melted = pd.melt(
-        metrics_df, id_vars="model", value_vars=["meteor", "bleu_1", "rouge_l"]
-    )
-
-    barplot = sns.barplot(x="variable", y="value", hue="model", data=df_melted)
-
-    # Set different hatches for each model
-    hatches = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*", "//", "\\\\"]
-
-    # Create a dictionary to map models to hatches
-    model_hatches = {
-        model: hatches[i % len(hatches)]
-        for i, model in enumerate(metrics_df["model"].unique())
-    }
-
-    # Apply hatches based on the model
-    num_vars = len(df_melted["variable"].unique())
-    for i, bar in enumerate(barplot.patches):
-        model = df_melted["model"].iloc[i // num_vars]
-        bar.set_hatch(model_hatches[model])
-
-    # Manually update legend to match the bar hatches
-    handles, labels = barplot.get_legend_handles_labels()
-    for handle, model in zip(handles, metrics_df["model"].unique()):
-        handle.set_hatch(model_hatches[model])
-
-    barplot.set_xticklabels(["METEOR", "BLEU-1", "ROUGE-L"])
-    for p in barplot.patches:
-        if p.get_height() == 0:
-            continue
-        barplot.annotate(
-            f"{p.get_height():.2f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_height()),
-            ha="center",
-            va="center",
-            xytext=(0, 10),
-            textcoords="offset points",
-        )
-
-    barplot.set(ylim=ylim, ylabel="Scores", xlabel="Metrics")
-    plt.legend(bbox_to_anchor=(0.5, -0.1), loc="upper center")
-    plt.show()
-
-
-def plot_times(perf_df, ylim=0.421):
-    # Adjusted code to put "train-time" bars in red at the bottom
-
-    fig, ax1 = plt.subplots(figsize=(12, 10))
-
-    color_train = "tab:red"
-    color_eval = "orange"
-    ax1.set_xlabel("Models")
-    ax1.set_ylabel("Time (mins)")
-    ax1.set_xticks(range(len(perf_df["model"])))  # Set x-ticks positions
-    ax1.set_xticklabels(perf_df["model"], rotation=90)
-
-    # Plot "train-time" first so it's at the bottom
-    ax1.bar(
-        perf_df["model"],
-        perf_df["train-time(mins)"],
-        color=color_train,
-        label="train-time",
-    )
-
-    # Then, plot "eval-time" on top of "train-time"
-    ax1.bar(
-        perf_df["model"],
-        perf_df["eval-time(mins)"],
-        bottom=perf_df["train-time(mins)"],
-        color=color_eval,
-        label="eval-time",
-    )
-
-    ax1.tick_params(axis="y")
-    ax1.legend(loc="upper left")
-
-    if "meteor" in perf_df.columns:
-        ax2 = ax1.twinx()
-        color_meteor = "tab:blue"
-        ax2.set_ylabel("METEOR", color=color_meteor)
-        ax2.plot(
-            perf_df["model"],
-            perf_df["meteor"],
-            color=color_meteor,
-            marker="o",
-            label="meteor",
-        )
-        ax2.tick_params(axis="y", labelcolor=color_meteor)
-        ax2.legend(loc="upper right")
-        ax2.set_ylim(ax2.get_ylim()[0], ylim)
-
-    # Show numbers in bars
-    for p in ax1.patches:
-        height = p.get_height()
-        if height == 0:  # Skip bars with height 0
-            continue
-        ax1.annotate(
-            f"{height:.2f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_y() + height),
-            ha="center",
-            va="center",
-            xytext=(0, -10),
-            textcoords="offset points",
-        )
-
-    fig.tight_layout()
-    plt.show()
-
-
-def translate_via_llm(text):
-    base_url = os.getenv("OPENAI_BASE_URL") or "http://localhost:8000/v1"
-    llm = ChatOpenAI(
-        model="gpt-4o",
-        temperature=0,
-        max_tokens=None,
-        timeout=None,
-        max_retries=2,
-        base_url=base_url,
-    )
-
-    prompt = ChatPromptTemplate.from_messages(
-        [
-            (
-                "human",
-                "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{input}",
-            ),
-        ]
-    )
-
-    chain = prompt | llm
-    response = chain.invoke(
-        {
-            "input": text,
-        }
-    )
-    return response.content
-
-
-def translate(text, cache_dict):
-    if text in cache_dict:
-        return cache_dict[text]
-    else:
-        translated_text = translate_via_llm(text)
-        cache_dict[text] = translated_text
-        return translated_text
notebooks/00b_Data Analysis_Few_Shots.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
results/mac-results_few_shots_metrics.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36826dcad77fd126f2aec3afc445ad64ad9ccf086b0dd3c31685646d4ee57c42
-size 10540
+oid sha256:1f1a365cbe33bfd36ebae3cb08e0dc4e3c1fe5d2dfbf9f05ddb14df4e5842cd7
+size 12417
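
A possible usage sketch (not taken from the repository) for the new `existing_metrics_df` parameter: the reuse branch in the diff above converts a matching cached row into a dict and still reads nested keys such as `metrics["sacrebleu"]["score"]`, so the cache is assumed to hold raw `calc_metrics` outputs keyed by model and rpp value; the data path below is hypothetical.

```python
import pandas as pd

from llm_toolkit.translation_utils import calc_metrics, get_metrics

# Hypothetical results file: the first two columns are "chinese" and "english",
# the remaining columns are model outputs named "<model>/rpp-<value>".
df = pd.read_csv("results/mac-results_few_shots.csv")

variant = "rpp"
records = []
for col in df.columns[2:]:
    model, value = col.split(f"/{variant}-")
    metrics = calc_metrics(df["english"], df[col], sources=df["chinese"])
    metrics["model"] = model       # lookup keys used by get_metrics
    metrics[variant] = int(value)
    records.append(metrics)

cache_df = pd.DataFrame(records)   # keeps the nested metric dicts per row

# Later runs can skip the expensive COMET/BLEU computation:
metrics_df = get_metrics(df, variant=variant, existing_metrics_df=cache_df)
```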