marinone94
committed on
Commit
·
09dc80f
1
Parent(s):
044dff6
train only on nst
Browse files
- run.sh +3 -3
- run_speech_recognition_ctc.py +10 -3
run.sh
CHANGED
@@ -2,10 +2,10 @@ python run_speech_recognition_ctc.py \
|
|
2 |
--dataset_name="mozilla-foundation/common_voice_7_0,marinone94/nst_sv" \
|
3 |
--model_name_or_path="KBLab/wav2vec2-large-voxrex" \
|
4 |
--dataset_config_name="sv-SE,distant_channel" \
|
5 |
-
--train_split_name="
|
6 |
--eval_split_name="test,None" \
|
7 |
--output_dir="./" \
|
8 |
-
--
|
9 |
--num_train_epochs="3" \
|
10 |
--per_device_train_batch_size="32" \
|
11 |
--per_device_eval_batch_size="32" \
|
@@ -19,7 +19,7 @@ python run_speech_recognition_ctc.py \
|
|
19 |
--save_steps="100" \
|
20 |
--text_column_name="sentence" \
|
21 |
--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
|
22 |
-
--logging_steps="
|
23 |
--dataset_seed="42" \
|
24 |
--layerdrop="0.0" \
|
25 |
--activation_dropout="0.1" \
|
|
|
2 |
--dataset_name="mozilla-foundation/common_voice_7_0,marinone94/nst_sv" \
|
3 |
--model_name_or_path="KBLab/wav2vec2-large-voxrex" \
|
4 |
--dataset_config_name="sv-SE,distant_channel" \
|
5 |
+
--train_split_name="None,train" \
|
6 |
--eval_split_name="test,None" \
|
7 |
--output_dir="./" \
|
8 |
+
--overwrite_output_dir \
|
9 |
--num_train_epochs="3" \
|
10 |
--per_device_train_batch_size="32" \
|
11 |
--per_device_eval_batch_size="32" \
|
|
|
19 |
--save_steps="100" \
|
20 |
--text_column_name="sentence" \
|
21 |
--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
|
22 |
+
--logging_steps="20" \
|
23 |
--dataset_seed="42" \
|
24 |
--layerdrop="0.0" \
|
25 |
--activation_dropout="0.1" \
|
run_speech_recognition_ctc.py
CHANGED
@@ -371,10 +371,12 @@ def main():
|
|
371 |
# TODO: Replace with check of wandb env vars
|
372 |
try:
|
373 |
repo_name = os.getcwd().split("/")[-1]
|
|
|
374 |
os.environ["WANDB_PROJECT"] = repo_name
|
375 |
wandb.login()
|
376 |
training_args.report_to = ["wandb"]
|
377 |
-
training_args.run_name =
|
|
|
378 |
except:
|
379 |
pass
|
380 |
|
@@ -544,6 +546,7 @@ def main():
|
|
544 |
.replace("î", "i") \
|
545 |
.replace("ü", "u") \
|
546 |
.replace("ÿ", "y") \
|
|
|
547 |
.replace("\\", "") \
|
548 |
.replace("/", "") \
|
549 |
.replace("|", "") \
|
@@ -557,6 +560,7 @@ def main():
|
|
557 |
.replace("î", "i") \
|
558 |
.replace("ü", "u") \
|
559 |
.replace("ÿ", "y") \
|
|
|
560 |
.replace("\\", "") \
|
561 |
.replace("/", "") \
|
562 |
.replace("|", "") \
|
@@ -754,8 +758,11 @@ def main():
|
|
754 |
# Log sample of datasets
|
755 |
pd_train = vectorized_datasets["train"].select(range(10)).to_pandas()
|
756 |
pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
|
757 |
-
wandb.log({"train_sample": pd_train})
|
758 |
-
wandb.log("eval_sample": pd_eval)
|
|
|
|
|
|
|
759 |
|
760 |
# for large datasets it is advised to run the preprocessing on a
|
761 |
# single machine first with ``args.preprocessing_only`` since there will mostly likely
|
|
|
371 |
# TODO: Replace with check of wandb env vars
|
372 |
try:
|
373 |
repo_name = os.getcwd().split("/")[-1]
|
374 |
+
run_name = f"{datetime.datetime.utcnow()}".replace(" ", "T")
|
375 |
os.environ["WANDB_PROJECT"] = repo_name
|
376 |
wandb.login()
|
377 |
training_args.report_to = ["wandb"]
|
378 |
+
training_args.run_name = run_name
|
379 |
+
wandb.init()
|
380 |
except:
|
381 |
pass
|
382 |
|
|
|
546 |
.replace("î", "i") \
|
547 |
.replace("ü", "u") \
|
548 |
.replace("ÿ", "y") \
|
549 |
+
.replace("ô", "o") \
|
550 |
.replace("\\", "") \
|
551 |
.replace("/", "") \
|
552 |
.replace("|", "") \
|
|
|
560 |
.replace("î", "i") \
|
561 |
.replace("ü", "u") \
|
562 |
.replace("ÿ", "y") \
|
563 |
+
.replace("ô", "o") \
|
564 |
.replace("\\", "") \
|
565 |
.replace("/", "") \
|
566 |
.replace("|", "") \
|
|
|
758 |
# Log sample of datasets
|
759 |
pd_train = vectorized_datasets["train"].select(range(10)).to_pandas()
|
760 |
pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
|
761 |
+
# wandb.log({"train_sample": pd_train})
|
762 |
+
# wandb.log({"eval_sample": pd_eval})
|
763 |
+
|
764 |
+
print(pd_train)
|
765 |
+
print(pd_eval)
|
766 |
|
767 |
# for large datasets it is advised to run the preprocessing on a
|
768 |
# single machine first with ``args.preprocessing_only`` since there will mostly likely
|