"""Scan a LibriSpeech split for rare ("biasing") words that appear in the
reference text but are missing from an existing transcript, and dump the
matches to JSON."""
import json
import os
import string

from datasets import load_dataset
from tqdm import tqdm
def process(text: str) -> str:
    """Normalize a transcript: lower-case, drop punctuation (keeping
    apostrophes), and trim surrounding whitespace."""
    text = text.lower()
    # Remove punctuation except apostrophes, so contractions stay intact
    punctuation_to_remove = string.punctuation.replace("'", "")
    translation_table = str.maketrans('', '', punctuation_to_remove)
    text = text.translate(translation_table)
    # strip() also handles empty or all-whitespace input, which the original
    # character-by-character loop crashed on with an IndexError
    return text.strip()
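# For example (hypothetical input), process('  "Hello, World!" it\'s me.  ')
# returns "hello world it's me": the quotes and commas are dropped, but the
# apostrophe in "it's" survives.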
split_name = "train.other.500"

# Rare-word ("biasing") list; a set makes the per-word membership test O(1)
with open("./blist/all_rare_words.txt") as fin:
    rarewords = {process(word.strip()) for word in fin}

# Previously generated transcripts for this split, one line per utterance
with open(f"./transcripts/{split_name}.txt") as fin:
    transcripts = [line.strip() for line in fin]
cache_dir = "./../cache"
# The "all" config is assumed here, since it is the one that exposes the
# combined train.other.500 split
dataset = load_dataset(
    "openslr/librispeech_asr", "all", cache_dir=cache_dir, trust_remote_code=True
)
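# Sketch of a sanity check, assuming the transcript file was written in
# dataset order (the two are paired by index below):
assert len(transcripts) == len(dataset[split_name]), \
    "transcript count must match the number of utterances in the split"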
train_data = []
pbar = tqdm(dataset[split_name])
for idx, sample in enumerate(pbar):
    text = process(sample["text"])
    transcript = transcripts[idx]
    # Rare words that appear in the reference text but are missing from the
    # transcript (a plain substring check, so a match inside a longer word
    # also counts as present)
    bwords = [word for word in text.split()
              if word in rarewords and word not in transcript]
    if bwords:
        train_data.append({
            "split": split_name,
            "idx": idx,
            "text": text,
            "transcript": transcript,
            "b_words": bwords,
        })
    pbar.set_description(f"Len of train data: {len(train_data)}")
with open(f"./train_data/{split_name}.json", "w") as fout:
json.dump(train_data, fout, indent=4)
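# Each record in the resulting JSON looks roughly like this (values are
# illustrative, not taken from the actual split):
# {
#     "split": "train.other.500",
#     "idx": 42,
#     "text": "he spoke of the quixotic quest",
#     "transcript": "he spoke of the quest",
#     "b_words": ["quixotic"]
# }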