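"""Build a biasing-word training set from LibriSpeech.

For each utterance in a LibriSpeech split, compare the reference text
against a pre-computed transcript and record the rare words (from a
biasing list) that the transcript missed. Samples containing at least
one such word are written out as JSON for later use.
"""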
import os
import json
import string
from tqdm import tqdm
from datasets import load_dataset
def process(text):
    # Lowercase every letter
    text = text.lower()
    # Remove punctuation except apostrophes, so contractions like "don't" survive
    punctuation_to_remove = string.punctuation.replace("'", "")
    translation_table = str.maketrans('', '', punctuation_to_remove)
    text = text.translate(translation_table)
    # Trim leading/trailing whitespace; strip() also handles the empty-string
    # case that the old character-by-character loop crashed on
    text = text.strip()
    return text
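# Example (illustrative): process("  Hello, World! Don't stop.  ")
# returns "hello world don't stop"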
split_name = "train.other.500"
# Load the rare-word biasing list into a set for O(1) membership tests
with open("./blist/all_rare_words.txt") as fin:
    rarewords = {process(word.strip()) for word in fin}
# One transcript per line, assumed aligned by index with the dataset split
with open(f"./transcripts/{split_name}.txt") as fin:
    transcripts = [line.strip() for line in fin]
# Download/load LibriSpeech from the Hugging Face Hub (cached locally)
cache_dir = "./../cache"
dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)
train_data = []
pbar = tqdm(dataset[split_name])
for idx, sample in enumerate(pbar):
    text = process(sample["text"])
    transcript = transcripts[idx]
    # Collect rare words that occur in the reference text but not in the
    # transcript (note: `word not in transcript` is a substring test)
    bwords = []
    for word in text.split():
        if word in rarewords and word not in transcript:
            bwords.append(word)
    # Keep only samples that contain at least one missed rare word
    if len(bwords) > 0:
        train_data.append({
            "split": split_name,
            "idx": idx,
            "text": text,
            "transcript": transcript,
            "b_words": bwords,
        })
    pbar.set_description(f"Len of train data: {len(train_data)}")
with open(f"./train_data/{split_name}.json", "w") as fout:
json.dump(train_data, fout, indent=4) |
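# A written record looks like this (field values illustrative):
# {
#     "split": "train.other.500",
#     "idx": 123,
#     "text": "he spoke of the quixotic quest",
#     "transcript": "he spoke of the quick otic quest",
#     "b_words": ["quixotic"]
# }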