Spaces:
Build error
Build error
data analysis notebooks
Browse files
llm_toolkit/translation_utils.py
CHANGED
@@ -163,6 +163,24 @@ def load_translation_dataset(data_path, tokenizer=None):
|
|
163 |
return datasets
|
164 |
|
165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
def get_metrics(df, max_output_tokens=2048):
|
167 |
metrics_df = pd.DataFrame(df.columns.T)[2:]
|
168 |
metrics_df.rename(columns={0: "model"}, inplace=True)
|
@@ -203,15 +221,15 @@ def get_metrics(df, max_output_tokens=2048):
|
|
203 |
)
|
204 |
|
205 |
num_entries_with_max_output_tokens.append(
|
206 |
-
df["output_tokens"]
|
207 |
)
|
208 |
|
209 |
metrics_df["meteor"] = meteor
|
210 |
metrics_df["bleu_1"] = bleu_1
|
211 |
metrics_df["rouge_l"] = rouge_l
|
212 |
metrics_df["ews_score"] = ews_score
|
213 |
-
metrics_df["repetition_score"] =
|
214 |
-
metrics_df["total_repetitions"] =
|
215 |
metrics_df["num_entries_with_max_output_tokens"] = (
|
216 |
num_entries_with_max_output_tokens
|
217 |
)
|
|
|
163 |
return datasets
|
164 |
|
165 |
|
166 |
+
def count_entries_with_max_tokens(entries, max_tokens):
    """
    Count the number of entries with the max output tokens or more.

    Parameters:
    entries (iterable of int): Token counts, one per entry. Any iterable
        works (list, generator, or a column such as df["output_tokens"]);
        it is traversed exactly once.
    max_tokens (int): The maximum token threshold.

    Returns:
    int: The number of entries with token counts greater than or equal to
        max_tokens. Returns 0 when entries is empty.
    """
    # Summing 1 per match (rather than summing the comparison results) keeps
    # the return value a plain int whatever boolean type the comparison yields.
    return sum(1 for tokens in entries if tokens >= max_tokens)
|
182 |
+
|
183 |
+
|
184 |
def get_metrics(df, max_output_tokens=2048):
|
185 |
metrics_df = pd.DataFrame(df.columns.T)[2:]
|
186 |
metrics_df.rename(columns={0: "model"}, inplace=True)
|
|
|
221 |
)
|
222 |
|
223 |
num_entries_with_max_output_tokens.append(
|
224 |
+
count_entries_with_max_tokens(df["output_tokens"], max_output_tokens)
|
225 |
)
|
226 |
|
227 |
metrics_df["meteor"] = meteor
|
228 |
metrics_df["bleu_1"] = bleu_1
|
229 |
metrics_df["rouge_l"] = rouge_l
|
230 |
metrics_df["ews_score"] = ews_score
|
231 |
+
metrics_df["repetition_score"] = repetition_score
|
232 |
+
metrics_df["total_repetitions"] = total_repetitions
|
233 |
metrics_df["num_entries_with_max_output_tokens"] = (
|
234 |
num_entries_with_max_output_tokens
|
235 |
)
|
notebooks/00_Data Analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/00a_Data Analysis_greedy_decoding.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|