# video_translator / sourcetoTarget.py
# Gotenks1893's picture
# Upload 8 files
# 168a18b
# raw
# history blame
# 2.68 kB
# Transformers installation
# pip install transformers
import os
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Load the tokenizer and the seq2seq model from the same pretrained
# checkpoint; both are created once, when this module is imported.
_NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(_NLLB_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_NLLB_CHECKPOINT)
def translateSentence(sentence, target_lang="deu_Latn"):
    """Translate a single sentence with the module-level tokenizer and model.

    Args:
        sentence: Source text to translate.
        target_lang: Language code token forced as the first generated
            token. Defaults to "deu_Latn" (German), the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The decoded translation with special tokens removed, or "" when
        ``sentence`` is empty or contains only whitespace.
    """
    # Nothing to translate: skip the model call instead of generating
    # output for an empty prompt.
    if not sentence or not sentence.strip():
        return ""

    inputs = tokenizer(sentence, return_tensors="pt")
    # The language code is looked up as a regular vocabulary token.
    # `tokenizer.lang_code_to_id` is deprecated in recent transformers
    # releases, while `convert_tokens_to_ids` works on old and new versions.
    target_token_id = tokenizer.convert_tokens_to_ids(target_lang)
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=target_token_id,
        max_length=400,
    )
    # Only one sentence was passed in, so the batch has a single entry.
    return tokenizer.batch_decode(
        translated_tokens, skip_special_tokens=True)[0]
def translateAllTranscripts():
    """Translate every ``.txt`` transcript found in the ``whisper`` directory.

    Each entry of ``whisper`` is reported with a "reading ..." line; entries
    ending in ``.txt`` are translated by ``englishToGerman`` and saved under
    the same file name in the ``translatedTranscripts`` directory.

    Raises:
        FileNotFoundError: If the ``whisper`` directory does not exist.
    """
    # List first so a missing source directory fails before anything is
    # created on disk.
    entries = os.listdir("whisper")
    # Make sure the output directory exists even on a fresh checkout.
    os.makedirs("translatedTranscripts", exist_ok=True)

    suffix = ".txt"
    for filename in entries:
        print(f'reading {filename}')
        if not filename.endswith(suffix):
            continue
        # englishToGerman rebuilds "whisper/<id>.txt" and
        # "translatedTranscripts/<id>.txt" from the bare id, so stripping
        # the suffix yields exactly the paths this loop used to build itself.
        englishToGerman(filename[:-len(suffix)])
def englishToGerman(videoId):
    """Translate the transcript ``whisper/<videoId>.txt`` and save the result.

    The source text is split on '.', every non-blank fragment is passed to
    ``translateSentence``, and the translations are joined with single
    spaces and written to ``translatedTranscripts/<videoId>.txt``.

    Args:
        videoId: Base name (without ``.txt``) of the transcript file.

    Raises:
        FileNotFoundError: If the source transcript does not exist.
    """
    source_path = f"whisper/{videoId}.txt"
    # Read as UTF-8 explicitly so input decoding matches the UTF-8 output
    # below instead of depending on the platform default encoding.
    with open(source_path, 'r', encoding="utf-8") as f:
        sourceText = f.read()

    # NOTE(review): splitting on '.' drops the periods and also splits
    # decimals and abbreviations; kept as-is to preserve existing output.
    # Blank fragments (there is always one after the final period) are
    # skipped so the model is never asked to translate empty text.
    fragments = (piece.strip() for piece in sourceText.split('.'))
    translationArr = [
        translateSentence(fragment) for fragment in fragments if fragment
    ]
    translation = " ".join(translationArr)

    fname = f"translatedTranscripts/{videoId}.txt"
    # Create the output directory on first use rather than failing in open().
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    with open(fname, "w", encoding="UTF8") as ft:
        ft.write(translation)
    print("written to: " + fname)