Update app.py
app.py CHANGED
@@ -16,15 +16,15 @@ app.add_middleware(
)

# Load the model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained("EzekielMW/
-model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/
+tokenizer = AutoTokenizer.from_pretrained("EzekielMW/LuoKslGloss")
+model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/LuoKslGloss")

+# Where should output files be stored locally
# Where should output files be stored locally
drive_folder = "./serverlogs"

if not os.path.exists(drive_folder):
-
-
+    %mkdir $drive_folder

# Large batch sizes generally give good results for translation
effective_train_batch_size = 480
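A quick portability note on the hunk above: %mkdir $drive_folder is an IPython magic, so it only runs inside a notebook kernel, not in a plain Python app.py. A minimal sketch of an equivalent that works anywhere, using the same drive_folder value as in the diff:

import os

drive_folder = "./serverlogs"

# Create the logs folder if it is missing; exist_ok makes the call a no-op
# when the directory already exists.
os.makedirs(drive_folder, exist_ok=True)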
@@ -38,8 +38,8 @@ yaml_config = '''
training_args:
  output_dir: "{drive_folder}"
  eval_strategy: steps
-  eval_steps:
-  save_steps:
+  eval_steps: 200
+  save_steps: 200
  gradient_accumulation_steps: {gradient_accumulation_steps}
  learning_rate: 3.0e-4 # Include decimal point to parse as float
  # optim: adafactor
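The {drive_folder} and {gradient_accumulation_steps} placeholders in training_args are presumably filled in before the YAML is parsed, and gradient accumulation is what turns a small per-device batch into the effective batch size of 480 set earlier in the file. A rough, hedged sketch of that wiring; the per-device batch size and the exact formatting call are assumptions, since neither appears in this diff:

import yaml
import transformers

effective_train_batch_size = 480
per_device_train_batch_size = 8  # assumed example value, not shown in the diff

# Accumulate gradients until the effective batch size is reached: 480 // 8 = 60.
gradient_accumulation_steps = effective_train_batch_size // per_device_train_batch_size

drive_folder = "./serverlogs"
yaml_config = '''
training_args:
  output_dir: "{drive_folder}"
  eval_strategy: steps
  eval_steps: 200
  save_steps: 200
  gradient_accumulation_steps: {gradient_accumulation_steps}
  learning_rate: 3.0e-4
'''

# Substitute the placeholders, parse the YAML, and build the trainer arguments,
# mirroring the yaml.safe_load / Seq2SeqTrainingArguments lines later in the diff.
config = yaml.safe_load(yaml_config.format(
    drive_folder=drive_folder,
    gradient_accumulation_steps=gradient_accumulation_steps))
training_settings = transformers.Seq2SeqTrainingArguments(**config["training_args"])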
@@ -64,24 +64,25 @@ data_dir: .
# Use a 600M parameter model here, which is easier to train on a free Colab
# instance. Bigger models work better, however: results will be improved
# if able to train on nllb-200-1.3B instead.
-model_checkpoint: facebook/nllb-200-
+model_checkpoint: facebook/nllb-200-1.3B

datasets:
  train:
    huggingface_load:
      # We will load two datasets here: English/KSL Gloss, and also SALT
      # Swahili/English, so that we can try out multi-way translation.
-
      - path: EzekielMW/Eksl_dataset
        split: train[:-1000]
+      - path: EzekielMW/Luo_Swa
+        split: train[:-2000]
      - path: sunbird/salt
        name: text-all
        split: train
    source:
      # This is a text translation only, no audio.
      type: text
-      # The source text can be any of English, KSL or
-      language: [eng,ksl,swa]
+      # The source text can be any of English, KSL, Swahili or Dholuo.
+      language: [eng,ksl,swa,luo]
      preprocessing:
        # The models are case sensitive, so if the training text is all
        # capitals, then it will only learn to translate capital letters and
@@ -92,8 +93,8 @@ datasets:
        - augment_characters
    target:
      type: text
-      # The target text with any of English, KSL or
-      language: [eng,ksl,swa]
+      # The target text with any of English, KSL, Swahili or Dholuo.
+      language: [eng,ksl,swa,luo]
      # The models are case sensitive: make everything lower case for now.
      preprocessing:
        - lower_case
@@ -103,21 +104,24 @@ datasets:

  validation:
    huggingface_load:
-      # Use the last
+      # Use the last 1000 of the KSL examples for validation.
      - path: EzekielMW/Eksl_dataset
        split: train[-1000:]
+      # Use the last 2000 of the Luo examples for validation.
+      - path: EzekielMW/Luo_Swa
+        split: train[-2000:]
      # Add some Swahili validation text.
      - path: sunbird/salt
        name: text-all
        split: dev
    source:
      type: text
-      language: [swa,ksl,eng]
+      language: [swa,ksl,eng,luo]
      preprocessing:
        - lower_case
    target:
      type: text
-      language: [swa,ksl,eng]
+      language: [swa,ksl,eng,luo]
      preprocessing:
        - lower_case
    allow_same_src_and_tgt_language: False
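The train[:-1000] / train[-1000:] and train[:-2000] / train[-2000:] strings above use the Hugging Face datasets split-slicing syntax: each dataset's single train split is partitioned so the tail reserved for validation never appears in training. A small sketch of what those split strings resolve to when loaded directly (repository names taken from the diff):

from datasets import load_dataset

# Everything except the last 1000 rows of the KSL data is used for training...
ksl_train = load_dataset("EzekielMW/Eksl_dataset", split="train[:-1000]")
# ...and the last 1000 rows are held out for validation.
ksl_valid = load_dataset("EzekielMW/Eksl_dataset", split="train[-1000:]")

# Same idea for the Dholuo/Swahili data, with a 2000-row validation tail.
luo_train = load_dataset("EzekielMW/Luo_Swa", split="train[:-2000]")
luo_valid = load_dataset("EzekielMW/Luo_Swa", split="train[-2000:]")

print(len(ksl_train), len(ksl_valid), len(luo_train), len(luo_valid))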
@@ -134,10 +138,11 @@ config = yaml.safe_load(yaml_config)

training_settings = transformers.Seq2SeqTrainingArguments(
    **config["training_args"])
+
# The pre-trained model that we use has support for some African languages, but
# we need to adapt the tokenizer to languages that it wasn't trained with,
# such as KSL. Here we reuse the token from a different language.
-LANGUAGE_CODES = ["eng", "swa", "ksl"]
+LANGUAGE_CODES = ["eng", "swa", "ksl", "luo"]

code_mapping = {
    # Exact/close mapping
@@ -145,6 +150,7 @@ code_mapping = {
    'swa': 'swh_Latn',
    # Random mapping
    'ksl': 'ace_Latn',
+    'luo': 'luo_Latn',
}
tokenizer = transformers.NllbTokenizer.from_pretrained(
    config['model_checkpoint'],
@@ -155,7 +161,9 @@ offset = tokenizer.sp_model_size + tokenizer.fairseq_offset

for code in LANGUAGE_CODES:
    i = tokenizer.convert_tokens_to_ids(code_mapping[code])
-    tokenizer._added_tokens_encoder[code] =
+    tokenizer._added_tokens_encoder[code] = i
+
+transformers.generation.utils.ForcedBOSTokenLogitsProcessor = transformers.ForcedBOSTokenLogitsProcessor

# Define a translation function
def translate(text, source_language, target_language):
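The loop just above is the token-reuse trick the comments describe: each new code (ksl, and now luo) is written into the tokenizer's added-token table with the id of an existing NLLB language tag, so the model sees a language token it was actually trained with. A hedged sanity check of the effect, continuing from the tokenizer built above (it relies on the private _added_tokens_encoder attribute, exactly as the diff does):

# After the remapping loop, the new codes resolve to the reused NLLB tag ids.
assert tokenizer.convert_tokens_to_ids("ksl") == tokenizer.convert_tokens_to_ids("ace_Latn")
assert tokenizer.convert_tokens_to_ids("luo") == tokenizer.convert_tokens_to_ids("luo_Latn")

# So "ksl" and "luo" can now be used like any other NLLB language code, for
# example as the forced first token of the decoder during generation.
ksl_bos_id = tokenizer.convert_tokens_to_ids("ksl")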
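The diff ends at the translate signature, so its body is not shown. For NLLB-style checkpoints, a translation helper usually sets the source language on the tokenizer and forces the target-language token as the first decoder token, which is presumably what the ForcedBOSTokenLogitsProcessor alias added above supports. A hedged sketch of such a body, reusing the tokenizer and model names from the top of the file; it is not the author's actual implementation, and max_length is an arbitrary example value:

def translate(text, source_language, target_language):
    # Tell the tokenizer which language the input text is in.
    tokenizer.src_lang = source_language

    # Training lower-cased everything, so lower-case the input as well.
    inputs = tokenizer(text.lower(), return_tensors="pt")

    # Force generation to start with the target-language token; the remapped
    # codes such as "ksl" or "luo" resolve to valid NLLB language-tag ids.
    generated = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_language),
        max_length=128,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]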