EzekielMW committed
Commit 019a11e · verified · 1 parent: 9d6e290

Update app.py

Files changed (1): app.py (+25 −17)
app.py CHANGED
@@ -16,15 +16,15 @@ app.add_middleware(
 )
 
 # Load the model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained("EzekielMW/Eksl_dataset")
-model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/Eksl_dataset")
+tokenizer = AutoTokenizer.from_pretrained("EzekielMW/LuoKslGloss")
+model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/LuoKslGloss")
 
+# Where should output files be stored locally
 # Where should output files be stored locally
 drive_folder = "./serverlogs"
 
 if not os.path.exists(drive_folder):
-    os.makedirs(drive_folder)
-
+    %mkdir $drive_folder
 
 # Large batch sizes generally give good results for translation
 effective_train_batch_size = 480
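For context, the model-loading step above reduces to the sketch below. Note that %mkdir is an IPython line magic and only runs in a notebook; a plain Python script needs os.makedirs instead. The checkpoint name is taken from the diff; everything else is an assumption.

import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned checkpoint named in the diff (assumes it is
# accessible on the Hugging Face Hub).
tokenizer = AutoTokenizer.from_pretrained("EzekielMW/LuoKslGloss")
model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/LuoKslGloss")

# `%mkdir $drive_folder` only works under IPython; the portable
# equivalent in a plain script is:
drive_folder = "./serverlogs"
os.makedirs(drive_folder, exist_ok=True)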
@@ -38,8 +38,8 @@ yaml_config = '''
 training_args:
   output_dir: "{drive_folder}"
   eval_strategy: steps
-  eval_steps: 100
-  save_steps: 100
+  eval_steps: 200
+  save_steps: 200
   gradient_accumulation_steps: {gradient_accumulation_steps}
   learning_rate: 3.0e-4 # Include decimal point to parse as float
   # optim: adafactor
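A minimal sketch of how a YAML block like this is typically consumed: the template is formatted with drive_folder and a gradient_accumulation_steps value derived from the effective batch size, parsed with yaml.safe_load, and unpacked into Seq2SeqTrainingArguments. The per-device batch size below is a placeholder, and eval_strategy is the argument name used by recent transformers releases.

import yaml
import transformers

drive_folder = "./serverlogs"
effective_train_batch_size = 480
train_batch_size = 24  # hypothetical per-device batch size
gradient_accumulation_steps = effective_train_batch_size // train_batch_size

yaml_config = f'''
training_args:
  output_dir: "{drive_folder}"
  eval_strategy: steps
  eval_steps: 200
  save_steps: 200
  gradient_accumulation_steps: {gradient_accumulation_steps}
  learning_rate: 3.0e-4  # Include decimal point to parse as float
'''

config = yaml.safe_load(yaml_config)
training_settings = transformers.Seq2SeqTrainingArguments(
    **config["training_args"])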
@@ -64,24 +64,25 @@ data_dir: .
 # Use a 600M parameter model here, which is easier to train on a free Colab
 # instance. Bigger models work better, however: results will be improved
 # if able to train on nllb-200-1.3B instead.
-model_checkpoint: facebook/nllb-200-distilled-600M
+model_checkpoint: facebook/nllb-200-1.3B
 
 datasets:
   train:
     huggingface_load:
       # We will load two datasets here: English/KSL Gloss, and also SALT
       # Swahili/English, so that we can try out multi-way translation.
-
       - path: EzekielMW/Eksl_dataset
         split: train[:-1000]
+      - path: EzekielMW/Luo_Swa
+        split: train[:-2000]
       - path: sunbird/salt
         name: text-all
         split: train
     source:
       # This is a text translation only, no audio.
       type: text
-      # The source text can be any of English, KSL or Swahili.
-      language: [eng,ksl,swa]
+      # The source text can be any of English, KSL, Swahili or Dholuo.
+      language: [eng,ksl,swa,luo]
       preprocessing:
        # The models are case sensitive, so if the training text is all
        # capitals, then it will only learn to translate capital letters and
@@ -92,8 +93,8 @@ datasets:
        - augment_characters
     target:
       type: text
-      # The target text with any of English, KSL or Swahili.
-      language: [eng,ksl,swa]
+      # The target text can be any of English, KSL, Swahili or Dholuo.
+      language: [eng,ksl,swa,luo]
       # The models are case sensitive: make everything lower case for now.
       preprocessing:
        - lower_case
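The lower_case preprocessing matters because subword tokenizers treat case distinctly, so a model trained only on lower-case text handles capitals poorly. A quick illustration (the exact token pieces are model-dependent):

# Case changes the tokenization, so training and inference text
# should agree on casing.
print(tokenizer.tokenize("HELLO"))  # e.g. split into several pieces
print(tokenizer.tokenize("hello"))  # e.g. a single familiar piece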
@@ -103,21 +104,24 @@ datasets:
 
   validation:
     huggingface_load:
-      # Use the last 500 of the KSL examples for validation.
+      # Use the last 1000 of the KSL examples for validation.
       - path: EzekielMW/Eksl_dataset
         split: train[-1000:]
+      # Use the last 2000 of the Luo examples for validation.
+      - path: EzekielMW/Luo_Swa
+        split: train[-2000:]
       # Add some Swahili validation text.
       - path: sunbird/salt
         name: text-all
         split: dev
     source:
       type: text
-      language: [swa,ksl,eng]
+      language: [swa,ksl,eng,luo]
       preprocessing:
        - lower_case
     target:
       type: text
-      language: [swa,ksl,eng]
+      language: [swa,ksl,eng,luo]
       preprocessing:
        - lower_case
     allow_same_src_and_tgt_language: False
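The split expressions above use the datasets library's slicing syntax: train[:-2000] keeps all but the last 2000 rows for training, and train[-2000:] holds that tail out for validation. A minimal illustration, assuming only that the dataset path is public:

from datasets import load_dataset

# All but the last 2000 rows for training ...
train_ds = load_dataset("EzekielMW/Luo_Swa", split="train[:-2000]")
# ... and the held-out tail for validation.
valid_ds = load_dataset("EzekielMW/Luo_Swa", split="train[-2000:]")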
@@ -134,10 +138,11 @@ config = yaml.safe_load(yaml_config)
 
 training_settings = transformers.Seq2SeqTrainingArguments(
     **config["training_args"])
+
 # The pre-trained model that we use has support for some African languages, but
 # we need to adapt the tokenizer to languages that it wasn't trained with,
 # such as KSL. Here we reuse the token from a different language.
-LANGUAGE_CODES = ["eng", "swa", "ksl"]
+LANGUAGE_CODES = ["eng", "swa", "ksl", "luo"]
 
 code_mapping = {
     # Exact/close mapping
@@ -145,6 +150,7 @@ code_mapping = {
     'swa': 'swh_Latn',
     # Random mapping
     'ksl': 'ace_Latn',
+    'luo': 'luo_Latn',
 }
 tokenizer = transformers.NllbTokenizer.from_pretrained(
     config['model_checkpoint'],
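The mapping works because NLLB ships no token for KSL gloss, so an existing language token (ace_Latn) is borrowed for it, while luo_Latn is an NLLB language in its own right. A sketch of the aliasing loop that follows in the file; _added_tokens_encoder is the private attribute the diff uses, so this relies on tokenizer internals and may differ across transformers versions.

import transformers

code_mapping = {
    'eng': 'eng_Latn',
    'swa': 'swh_Latn',
    'ksl': 'ace_Latn',  # borrowed token: NLLB has no KSL code
    'luo': 'luo_Latn',
}

tokenizer = transformers.NllbTokenizer.from_pretrained(
    "facebook/nllb-200-1.3B")

# Point each short code at the id of its NLLB counterpart so the
# tokenizer recognises "eng", "swa", "ksl" and "luo" directly.
for code, nllb_code in code_mapping.items():
    i = tokenizer.convert_tokens_to_ids(nllb_code)
    tokenizer._added_tokens_encoder[code] = i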
@@ -155,7 +161,9 @@ offset = tokenizer.sp_model_size + tokenizer.fairseq_offset
 
 for code in LANGUAGE_CODES:
     i = tokenizer.convert_tokens_to_ids(code_mapping[code])
     tokenizer._added_tokens_encoder[code] = i
+
+transformers.generation.utils.ForcedBOSTokenLogitsProcessor = transformers.ForcedBOSTokenLogitsProcessor
 
 # Define a translation function
 def translate(text, source_language, target_language):
 
16
  )
17
 
18
  # Load the model and tokenizer
19
+ tokenizer = AutoTokenizer.from_pretrained("EzekielMW/LuoKslGloss")
20
+ model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/LuoKslGloss")
21
 
22
+ # Where should output files be stored locally
23
  # Where should output files be stored locally
24
  drive_folder = "./serverlogs"
25
 
26
  if not os.path.exists(drive_folder):
27
+ %mkdir $drive_folder
 
28
 
29
  # Large batch sizes generally give good results for translation
30
  effective_train_batch_size = 480
 
38
  training_args:
39
  output_dir: "{drive_folder}"
40
  eval_strategy: steps
41
+ eval_steps: 200
42
+ save_steps: 200
43
  gradient_accumulation_steps: {gradient_accumulation_steps}
44
  learning_rate: 3.0e-4 # Include decimal point to parse as float
45
  # optim: adafactor
 
64
  # Use a 600M parameter model here, which is easier to train on a free Colab
65
  # instance. Bigger models work better, however: results will be improved
66
  # if able to train on nllb-200-1.3B instead.
67
+ model_checkpoint: facebook/nllb-200-1.3B
68
 
69
  datasets:
70
  train:
71
  huggingface_load:
72
  # We will load two datasets here: English/KSL Gloss, and also SALT
73
  # Swahili/English, so that we can try out multi-way translation.
 
74
  - path: EzekielMW/Eksl_dataset
75
  split: train[:-1000]
76
+ - path: EzekielMW/Luo_Swa
77
+ split: train[:-2000]
78
  - path: sunbird/salt
79
  name: text-all
80
  split: train
81
  source:
82
  # This is a text translation only, no audio.
83
  type: text
84
+ # The source text can be any of English, KSL, Swahili or Dholuo.
85
+ language: [eng,ksl,swa,luo]
86
  preprocessing:
87
  # The models are case sensitive, so if the training text is all
88
  # capitals, then it will only learn to translate capital letters and
 
93
  - augment_characters
94
  target:
95
  type: text
96
+ # The target text with any of English, KSL, Swahili or Dholuo.
97
+ language: [eng,ksl,swa,luo]
98
  # The models are case sensitive: make everything lower case for now.
99
  preprocessing:
100
  - lower_case
 
104
 
105
  validation:
106
  huggingface_load:
107
+ # Use the last 1000 of the KSL examples for validation.
108
  - path: EzekielMW/Eksl_dataset
109
  split: train[-1000:]
110
+ # Use the last 2000 of the Luo examples for validation.
111
+ - path: EzekielMW/Luo_Swa
112
+ split: train[-2000:]
113
  # Add some Swahili validation text.
114
  - path: sunbird/salt
115
  name: text-all
116
  split: dev
117
  source:
118
  type: text
119
+ language: [swa,ksl,eng,luo]
120
  preprocessing:
121
  - lower_case
122
  target:
123
  type: text
124
+ language: [swa,ksl,eng,luo]
125
  preprocessing:
126
  - lower_case
127
  allow_same_src_and_tgt_language: False
 
138
 
139
  training_settings = transformers.Seq2SeqTrainingArguments(
140
  **config["training_args"])
141
+
142
  # The pre-trained model that we use has support for some African languages, but
143
  # we need to adapt the tokenizer to languages that it wasn't trained with,
144
  # such as KSL. Here we reuse the token from a different language.
145
+ LANGUAGE_CODES = ["eng", "swa", "ksl","luo"]
146
 
147
  code_mapping = {
148
  # Exact/close mapping
 
150
  'swa': 'swh_Latn',
151
  # Random mapping
152
  'ksl': 'ace_Latn',
153
+ 'luo': 'luo_Latn',
154
  }
155
  tokenizer = transformers.NllbTokenizer.from_pretrained(
156
  config['model_checkpoint'],
 
161
 
162
  for code in LANGUAGE_CODES:
163
  i = tokenizer.convert_tokens_to_ids(code_mapping[code])
164
+ tokenizer._added_tokens_encoder[code] = I
165
+
166
+ transformers.generation.utils.ForcedBOSTokenLogitsProcessor = transformers.ForcedBOSTokenLogitsProcessor
167
 
168
  # Define a translation function
169
  def translate(text, source_language, target_language):
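The body of translate is cut off here. Below is a hedged sketch of what such a function typically looks like for an NLLB-style model: set the source language on the tokenizer, then force the decoder to start with the target-language token. The parameter values are assumptions, not the file's actual implementation; the assignment to transformers.generation.utils.ForcedBOSTokenLogitsProcessor above appears to re-expose that class under an older module path for compatibility.

def translate(text, source_language, target_language):
    # Training text was lower-cased, so match that at inference time.
    tokenizer.src_lang = source_language
    inputs = tokenizer(text.lower(), return_tensors="pt")
    outputs = model.generate(
        **inputs,
        # Force the first generated token to be the target language code.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_language),
        max_length=128,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Hypothetical usage: translate("How are you?", "eng", "ksl")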