# Transformers installation
# pip install transformers
import os

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint shared by the tokenizer and the model.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
# Directory the source transcripts are read from.
SOURCE_DIR = "whisper"
# Directory the translated transcripts are written to.
TARGET_DIR = "translatedTranscripts"

# define tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


# translation function for single sentence
def translateSentence(sentence, targetLang="deu_Latn"):
    """Translate one sentence with the module-level NLLB model.

    Args:
        sentence: Text to translate.
        targetLang: Language-code token forced as the first generated
            token. Defaults to "deu_Latn" (German), the value this
            function previously hard-coded.

    Returns:
        The decoded translation with special tokens removed.
    """
    inputs = tokenizer(sentence, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        # convert_tokens_to_ids replaces the tokenizer's deprecated
        # lang_code_to_id mapping and also works on older releases.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(targetLang),
        max_length=400,
    )
    return tokenizer.batch_decode(
        translated_tokens, skip_special_tokens=True)[0]


def _translateText(sourceText):
    """Translate text fragment by fragment and join results with spaces."""
    # Splitting on '.' always leaves an empty (or whitespace-only) tail
    # after the final period; skip such fragments so the model is never
    # asked to translate nothing.
    fragments = (fragment.strip() for fragment in sourceText.split('.'))
    return " ".join(
        translateSentence(fragment) for fragment in fragments if fragment)


def _translateFile(sourcePath, targetPath):
    """Read sourcePath, translate its content and write it to targetPath."""
    # Read with an explicit encoding so the result does not depend on the
    # platform default; the output below is written as UTF-8 as well.
    # NOTE(review): assumes the transcripts are UTF-8 - confirm against
    # whatever produces the files in SOURCE_DIR.
    with open(sourcePath, 'r', encoding="utf-8") as f:
        sourceText = f.read()

    translation = _translateText(sourceText)

    # Create the output directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(os.path.dirname(targetPath) or ".", exist_ok=True)
    with open(targetPath, "w", encoding="UTF8") as ft:
        ft.write(translation)
    print("written to: " + targetPath)


def translateAllTranscripts():
    """Translate every ".txt" file in SOURCE_DIR into TARGET_DIR.

    Each directory entry is announced on stdout; entries that do not end
    in ".txt" are skipped. Output files keep the source file name.
    """
    for filename in os.listdir(SOURCE_DIR):
        print(f'reading {filename}')
        if not filename.endswith(".txt"):
            continue
        _translateFile(
            f"{SOURCE_DIR}/{filename}", f"{TARGET_DIR}/{filename}")


def englishToGerman(videoId):
    """Translate the transcript SOURCE_DIR/<videoId>.txt into TARGET_DIR.

    Args:
        videoId: Base name (without ".txt") of the transcript file.

    Raises:
        FileNotFoundError: If the source transcript does not exist.
    """
    _translateFile(
        f"{SOURCE_DIR}/{videoId}.txt", f"{TARGET_DIR}/{videoId}.txt")