chrisjay
/

fonxlsr

@@ -43,96 +43,16 @@ import torchaudio
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-#This will download the files from Laleye's GitHub to the directory FonAudio
-if not os.path.isdir("./FonAudio"):
-  !wget https://github.com/laleye/pyFongbe/archive/master/data.zip
-  with zipfile.ZipFile("data.zip","r") as zip_ref:
-    zip_ref.extractall("./FonAudio")
-with open('./FonAudio/pyFongbe-master/data/train.csv', newline='',encoding='UTF-8') as f:
-      reader = csv.reader(f)
-      data = list(reader)
-      train_data = [data[i] for i in range(len(data)) if i!=0]
-with open('./FonAudio/pyFongbe-master/data/test.csv', newline='',encoding='UTF-8') as f:
-      reader = csv.reader(f)
-      data = list(reader)
-      t_data = [data[i] for i in range(len(data)) if i!=0]
-#Get valid indices
-random.seed(42) #this seed was used specifically to compare
-                # with Okwugbe model (https://arxiv.org/abs/2103.07762)
-v = 1500
-test_list = [i for i in range(len(t_data))]
-valid_indices = random.choices(test_list, k=v)
-test_data = [t_data[i] for i in range(len(t_data)) if i not in valid_indices]
-valid_data = [t_data[i] for i in range(len(t_data)) if i in valid_indices]
-#Final length of validation_dataset -> 1107
-#Final length of test_dataset -> 1061
-#Please note, the final validation size is smaller than the expected (1500)
-#because we used random.choices, which samples with replacement and so can return duplicate indices.
-#Create JSON files
-def create_json_file(d):
-  utterance = d[2]
-  wav_path =d[0]
-  wav_path = wav_path.replace("/home/frejus/Projects/Fongbe_ASR/pyFongbe","./FonAudio/pyFongbe-master")
-  return {
-      "path": wav_path,
-      "sentence": utterance
-  }
-train_json = [create_json_file(i) for i in train_data]
-test_json = [create_json_file(i) for i in test_data]
-valid_json = [create_json_file(i) for i in valid_data]
-#Save JSON files to your Google Drive folders
-#Make folder in GDrive to store files
-train_path = '/content/drive/MyDrive/fon_xlsr/train'
-test_path = '/content/drive/MyDrive/fon_xlsr/test'
-valid_path = '/content/drive/MyDrive/fon_xlsr/valid'
-if not os.path.isdir(train_path):
-  print("Creating paths")
-  os.makedirs(train_path)
-  os.makedirs(test_path) #this is where we save the test files
-  os.makedirs(valid_path)
-#for train
-for i, sample in enumerate(train_json):
-  file_path = os.path.join(train_path,'train_fon_{}.json'.format(i))
-  with open(file_path, 'w') as outfile:
-    json.dump(sample, outfile)
-#for test
-for i, sample in enumerate(test_json):
-  file_path = os.path.join(test_path,'test_fon_{}.json'.format(i))
-  with open(file_path, 'w') as outfile:
-    json.dump(sample, outfile)
-#for valid
-for i, sample in enumerate(valid_json):
-  file_path = os.path.join(valid_path,'valid_fon_{}.json'.format(i))
-  with open(file_path, 'w') as outfile:
-    json.dump(sample, outfile)
 #Load test_dataset from saved files in folder
 from datasets import load_dataset, load_metric
 #for test
-for root, dirs, files in os.walk(test_path):
   test_dataset= load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
 #Remove unnecessary chars
-chars_to_ignore_regex =
 def remove_special_characters(batch):
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
     return batch
@@ -176,10 +96,10 @@ from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
-for root, dirs, files in os.walk(test_path):
   test_dataset = load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
-chars_to_ignore_regex =
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
     return batch
@@ -187,7 +107,7 @@ test_dataset = test_dataset.map(remove_special_characters)
 wer = load_metric("wer")
 processor = Wav2Vec2Processor.from_pretrained("chrisjay/wav2vec2-large-xlsr-53-fon")
-model = Wav2Vec2ForCTC.from_pretrained("chrisjay/wav2vec2-large-xlsr-53-fon") #use checkpoint-12400 to get our WER test results
 model.to("cuda")
 # Preprocessing the datasets.

 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 #Load test_dataset from saved files in folder
 from datasets import load_dataset, load_metric
 #for test
+for root, dirs, files in os.walk(test/):
   test_dataset= load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
 #Remove unnecessary chars
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”]'
 def remove_special_characters(batch):
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
     return batch
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import re
+for root, dirs, files in os.walk(test/):
   test_dataset = load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”]'
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
     return batch
 wer = load_metric("wer")
 processor = Wav2Vec2Processor.from_pretrained("chrisjay/wav2vec2-large-xlsr-53-fon")
+model = Wav2Vec2ForCTC.from_pretrained("chrisjay/wav2vec2-large-xlsr-53-fon")
 model.to("cuda")
 # Preprocessing the datasets.