dh-mc committed on
Commit
1a9edc9
·
1 Parent(s): 3f25ae5

data analysis notebooks

Browse files
llm_toolkit/translation_utils.py CHANGED
@@ -163,6 +163,24 @@ def load_translation_dataset(data_path, tokenizer=None):
163
  return datasets
164
 
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  def get_metrics(df, max_output_tokens=2048):
167
  metrics_df = pd.DataFrame(df.columns.T)[2:]
168
  metrics_df.rename(columns={0: "model"}, inplace=True)
@@ -203,15 +221,15 @@ def get_metrics(df, max_output_tokens=2048):
203
  )
204
 
205
  num_entries_with_max_output_tokens.append(
206
- df["output_tokens"].value_counts().get(max_output_tokens, 0)
207
  )
208
 
209
  metrics_df["meteor"] = meteor
210
  metrics_df["bleu_1"] = bleu_1
211
  metrics_df["rouge_l"] = rouge_l
212
  metrics_df["ews_score"] = ews_score
213
- metrics_df["repetition_score"] = ews_score
214
- metrics_df["total_repetitions"] = ews_score
215
  metrics_df["num_entries_with_max_output_tokens"] = (
216
  num_entries_with_max_output_tokens
217
  )
 
163
  return datasets
164
 
165
 
166
+ def count_entries_with_max_tokens(entries, max_tokens):
167
+ """
168
+ Count the number of entries with the max output tokens or more.
169
+
170
+ Parameters:
171
+ entries (list of int): List of token counts for each entry.
172
+ max_tokens (int): The maximum token threshold.
173
+
174
+ Returns:
175
+ int: The number of entries with token counts greater than or equal to max_tokens.
176
+ """
177
+ count = 0
178
+ for tokens in entries:
179
+ if tokens >= max_tokens:
180
+ count += 1
181
+ return count
182
+
183
+
184
  def get_metrics(df, max_output_tokens=2048):
185
  metrics_df = pd.DataFrame(df.columns.T)[2:]
186
  metrics_df.rename(columns={0: "model"}, inplace=True)
 
221
  )
222
 
223
  num_entries_with_max_output_tokens.append(
224
+ count_entries_with_max_tokens(df["output_tokens"], max_output_tokens)
225
  )
226
 
227
  metrics_df["meteor"] = meteor
228
  metrics_df["bleu_1"] = bleu_1
229
  metrics_df["rouge_l"] = rouge_l
230
  metrics_df["ews_score"] = ews_score
231
+ metrics_df["repetition_score"] = repetition_score
232
+ metrics_df["total_repetitions"] = total_repetitions
233
  metrics_df["num_entries_with_max_output_tokens"] = (
234
  num_entries_with_max_output_tokens
235
  )
notebooks/00_Data Analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/00a_Data Analysis_greedy_decoding.ipynb CHANGED
The diff for this file is too large to render. See raw diff