Spaces:
Sleeping
Sleeping
# Transformers installation:
#   pip install transformers
import os | |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
# Both objects are created once at import time and shared by the functions below.
_CHECKPOINT = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)
# translation function for single sentence
def translateSentence(sentence, target_lang="deu_Latn", max_length=400):
    """Translate one sentence with the module-level tokenizer and model.

    Args:
        sentence: Text to translate. Empty or whitespace-only input is
            returned as an empty string without running the model.
        target_lang: Language code token forced as the first generated
            token. Defaults to ``"deu_Latn"``, the value this function
            always used before the parameter existed.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded translation of ``sentence`` with special tokens removed.
    """
    # Nothing to translate; also avoids a model call on an empty prompt.
    if not sentence or not sentence.strip():
        return ""

    inputs = tokenizer(sentence, return_tensors="pt")
    # Look the language token up through the vocabulary instead of the
    # deprecated tokenizer.lang_code_to_id mapping.
    target_token_id = tokenizer.convert_tokens_to_ids(target_lang)
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=target_token_id, max_length=max_length
    )
    # generate() returns a batch; a single input yields a single row.
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
def translateAllTranscripts():
    """Translate every ``.txt`` transcript found in ``whisper/``.

    Each matching file is handed to ``englishToGerman``, which reads
    ``whisper/<name>.txt`` and writes ``translatedTranscripts/<name>.txt``.
    Every directory entry is reported with a ``reading ...`` line; entries
    with any other extension are then skipped.
    """
    source_dir = "whisper"
    suffix = ".txt"

    # Opening a file for writing inside a missing directory raises
    # FileNotFoundError, so make sure the output folder exists first.
    os.makedirs("translatedTranscripts", exist_ok=True)

    # Sorted so runs process (and log) the files in a stable order.
    for filename in sorted(os.listdir(source_dir)):
        print(f'reading {filename}')
        if not filename.endswith(suffix):
            continue
        # Strip only the trailing ".txt"; slicing (unlike os.path.splitext)
        # also handles a file that is literally named ".txt".
        englishToGerman(filename[: -len(suffix)])
def englishToGerman(videoId):
    """Translate the transcript ``whisper/<videoId>.txt`` and save the result.

    The source text is split on ``'.'``, each non-blank fragment is passed to
    ``translateSentence``, and the translations are joined with single spaces
    and written to ``translatedTranscripts/<videoId>.txt``.

    Args:
        videoId: Base name (without ``.txt``) of the transcript file.

    Raises:
        FileNotFoundError: If the source transcript does not exist.
    """
    source_path = f"whisper/{videoId}.txt"
    target_path = f"translatedTranscripts/{videoId}.txt"

    # Read with the same encoding used for writing instead of the platform
    # default. NOTE(review): assumes the transcripts are UTF-8 - confirm.
    with open(source_path, 'r', encoding="UTF8") as f:
        sourceText = f.read()  # read source text

    # split('.') leaves blank fragments (e.g. after the final period);
    # skip them so the model is never called on empty text.
    # NOTE: the split also breaks on periods inside numbers/abbreviations.
    fragments = [part for part in sourceText.split('.') if part.strip()]

    # translate each sentence and join the results
    translation = " ".join(translateSentence(part) for part in fragments)

    # save translated transcript, creating the output folder on first use
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, "w", encoding="UTF8") as ft:
        ft.write(translation)
    print("written to: " + target_path)