Update app.py
app.py CHANGED
@@ -16,15 +16,15 @@ app.add_middleware(
)

# Load the model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained("EzekielMW/
-model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/
+tokenizer = AutoTokenizer.from_pretrained("EzekielMW/LuoKslGloss")
+model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/LuoKslGloss")

+# Where should output files be stored locally
# Where should output files be stored locally
drive_folder = "./serverlogs"

if not os.path.exists(drive_folder):
-
-
+    %mkdir $drive_folder

# Large batch sizes generally give good results for translation
effective_train_batch_size = 480
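A quick portability note on the hunk above: %mkdir $drive_folder is an IPython magic, so it only runs inside a notebook kernel, not in a plain Python app.py. A minimal sketch of an equivalent that works anywhere, using the same drive_folder value as in the diff:

import os

drive_folder = "./serverlogs"

# Create the logs folder if it is missing; exist_ok makes the call a no-op
# when the directory already exists.
os.makedirs(drive_folder, exist_ok=True)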
@@ -38,8 +38,8 @@ yaml_config = '''
training_args:
  output_dir: "{drive_folder}"
  eval_strategy: steps
-  eval_steps:
-  save_steps:
+  eval_steps: 200
+  save_steps: 200
  gradient_accumulation_steps: {gradient_accumulation_steps}
  learning_rate: 3.0e-4 # Include decimal point to parse as float
  # optim: adafactor
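The {drive_folder} and {gradient_accumulation_steps} placeholders in training_args are presumably filled in before the YAML is parsed, and gradient accumulation is what turns a small per-device batch into the effective batch size of 480 set earlier in the file. A rough, hedged sketch of that wiring; the per-device batch size and the exact formatting call are assumptions, since neither appears in this diff:

import yaml
import transformers

effective_train_batch_size = 480
per_device_train_batch_size = 8  # assumed example value, not shown in the diff

# Accumulate gradients until the effective batch size is reached: 480 // 8 = 60.
gradient_accumulation_steps = effective_train_batch_size // per_device_train_batch_size

drive_folder = "./serverlogs"
yaml_config = '''
training_args:
  output_dir: "{drive_folder}"
  eval_strategy: steps
  eval_steps: 200
  save_steps: 200
  gradient_accumulation_steps: {gradient_accumulation_steps}
  learning_rate: 3.0e-4
'''

# Substitute the placeholders, parse the YAML, and build the trainer arguments,
# mirroring the yaml.safe_load / Seq2SeqTrainingArguments lines later in the diff.
config = yaml.safe_load(yaml_config.format(
    drive_folder=drive_folder,
    gradient_accumulation_steps=gradient_accumulation_steps))
training_settings = transformers.Seq2SeqTrainingArguments(**config["training_args"])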
@@ -64,24 +64,25 @@ data_dir: .
# Use a 600M parameter model here, which is easier to train on a free Colab
# instance. Bigger models work better, however: results will be improved
# if able to train on nllb-200-1.3B instead.
-model_checkpoint: facebook/nllb-200-
+model_checkpoint: facebook/nllb-200-1.3B

datasets:
  train:
    huggingface_load:
      # We will load two datasets here: English/KSL Gloss, and also SALT
      # Swahili/English, so that we can try out multi-way translation.
-
      - path: EzekielMW/Eksl_dataset
        split: train[:-1000]
+      - path: EzekielMW/Luo_Swa
+        split: train[:-2000]
      - path: sunbird/salt
        name: text-all
        split: train
    source:
      # This is a text translation only, no audio.
      type: text
-      # The source text can be any of English, KSL or
-      language: [eng,ksl,swa]
+      # The source text can be any of English, KSL, Swahili or Dholuo.
+      language: [eng,ksl,swa,luo]
      preprocessing:
        # The models are case sensitive, so if the training text is all
        # capitals, then it will only learn to translate capital letters and
@@ -92,8 +93,8 @@ datasets:
        - augment_characters
    target:
      type: text
-      # The target text with any of English, KSL or
-      language: [eng,ksl,swa]
+      # The target text with any of English, KSL, Swahili or Dholuo.
+      language: [eng,ksl,swa,luo]
      # The models are case sensitive: make everything lower case for now.
      preprocessing:
        - lower_case
@@ -103,21 +104,24 @@ datasets:

  validation:
    huggingface_load:
-      # Use the last
+      # Use the last 1000 of the KSL examples for validation.
      - path: EzekielMW/Eksl_dataset
        split: train[-1000:]
+      # Use the last 2000 of the Luo examples for validation.
+      - path: EzekielMW/Luo_Swa
+        split: train[-2000:]
      # Add some Swahili validation text.
      - path: sunbird/salt
        name: text-all
        split: dev
    source:
      type: text
-      language: [swa,ksl,eng]
+      language: [swa,ksl,eng,luo]
      preprocessing:
        - lower_case
    target:
      type: text
-      language: [swa,ksl,eng]
+      language: [swa,ksl,eng,luo]
      preprocessing:
        - lower_case
    allow_same_src_and_tgt_language: False
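The train[:-1000] / train[-1000:] and train[:-2000] / train[-2000:] strings above use the Hugging Face datasets split-slicing syntax: each dataset's single train split is partitioned so the tail reserved for validation never appears in training. A small sketch of what those split strings resolve to when loaded directly (repository names taken from the diff):

from datasets import load_dataset

# Everything except the last 1000 rows of the KSL data is used for training...
ksl_train = load_dataset("EzekielMW/Eksl_dataset", split="train[:-1000]")
# ...and the last 1000 rows are held out for validation.
ksl_valid = load_dataset("EzekielMW/Eksl_dataset", split="train[-1000:]")

# Same idea for the Dholuo/Swahili data, with a 2000-row validation tail.
luo_train = load_dataset("EzekielMW/Luo_Swa", split="train[:-2000]")
luo_valid = load_dataset("EzekielMW/Luo_Swa", split="train[-2000:]")

print(len(ksl_train), len(ksl_valid), len(luo_train), len(luo_valid))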
@@ -134,10 +138,11 @@ config = yaml.safe_load(yaml_config)

training_settings = transformers.Seq2SeqTrainingArguments(
    **config["training_args"])
+
# The pre-trained model that we use has support for some African languages, but
# we need to adapt the tokenizer to languages that it wasn't trained with,
# such as KSL. Here we reuse the token from a different language.
-LANGUAGE_CODES = ["eng", "swa", "ksl"]
+LANGUAGE_CODES = ["eng", "swa", "ksl", "luo"]

code_mapping = {
    # Exact/close mapping
@@ -145,6 +150,7 @@ code_mapping = {
    'swa': 'swh_Latn',
    # Random mapping
    'ksl': 'ace_Latn',
+    'luo': 'luo_Latn',
}
tokenizer = transformers.NllbTokenizer.from_pretrained(
    config['model_checkpoint'],
@@ -155,7 +161,9 @@ offset = tokenizer.sp_model_size + tokenizer.fairseq_offset

for code in LANGUAGE_CODES:
    i = tokenizer.convert_tokens_to_ids(code_mapping[code])
-    tokenizer._added_tokens_encoder[code] =
+    tokenizer._added_tokens_encoder[code] = i
+
+transformers.generation.utils.ForcedBOSTokenLogitsProcessor = transformers.ForcedBOSTokenLogitsProcessor

# Define a translation function
def translate(text, source_language, target_language):
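The loop just above is the token-reuse trick the comments describe: each new code (ksl, and now luo) is written into the tokenizer's added-token table with the id of an existing NLLB language tag, so the model sees a language token it was actually trained with. A hedged sanity check of the effect, continuing from the tokenizer built above (it relies on the private _added_tokens_encoder attribute, exactly as the diff does):

# After the remapping loop, the new codes resolve to the reused NLLB tag ids.
assert tokenizer.convert_tokens_to_ids("ksl") == tokenizer.convert_tokens_to_ids("ace_Latn")
assert tokenizer.convert_tokens_to_ids("luo") == tokenizer.convert_tokens_to_ids("luo_Latn")

# So "ksl" and "luo" can now be used like any other NLLB language code, for
# example as the forced first token of the decoder during generation.
ksl_bos_id = tokenizer.convert_tokens_to_ids("ksl")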
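The diff ends at the translate signature, so its body is not shown. For NLLB-style checkpoints, a translation helper usually sets the source language on the tokenizer and forces the target-language token as the first decoder token, which is presumably what the ForcedBOSTokenLogitsProcessor alias added above supports. A hedged sketch of such a body, reusing the tokenizer and model names from the top of the file; it is not the author's actual implementation, and max_length is an arbitrary example value:

def translate(text, source_language, target_language):
    # Tell the tokenizer which language the input text is in.
    tokenizer.src_lang = source_language

    # Training lower-cased everything, so lower-case the input as well.
    inputs = tokenizer(text.lower(), return_tensors="pt")

    # Force generation to start with the target-language token; the remapped
    # codes such as "ksl" or "luo" resolve to valid NLLB language-tag ids.
    generated = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_language),
        max_length=128,
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]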