speech-to-speech-translation-Elesin2

Sleeping

App Files Files Community

speech-to-speech-translation-Elesin2 / app.py

vladelesin

Update app.py

866bb2a over 1 year ago

raw

history blame

2.32 kB

	"""
	Automatically generated by Colaboratory.
	Original file is located at
	https://colab.research.google.com/drive/16MxXQeF3O0htL9eQ61aa6ZxnApGg9TKN
	"""

	import gradio as gr
	import numpy as np
	import torch
	import phonemizer

	from transformers import pipeline
	from transformers import VitsModel, VitsTokenizer, FSMTForConditionalGeneration, FSMTTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor, MarianMTModel, MarianTokenizer, T5ForConditionalGeneration, T5Tokenizer

	# Run on the first CUDA device when one is available, otherwise on the CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"

	# Speech recognition: audio -> English text.
	asr_pipe = pipeline("automatic-speech-recognition", model="asapp/sew-d-tiny-100k-ft-ls100h", device=device)

	# Machine translation: English text -> Russian text.
	translation_en_to_rus = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")

	# Speech synthesis: Russian text -> waveform.
	# VitsModel/VitsTokenizer must be given a VITS text-to-speech checkpoint; a
	# Marian translation checkpoint such as "Helsinki-NLP/opus-mt-ru-en" has
	# neither VITS weights nor a VITS vocabulary and cannot synthesise speech.
	TTS_CHECKPOINT = "facebook/mms-tts-rus"
	model = VitsModel.from_pretrained(TTS_CHECKPOINT)
	tokenizer = VitsTokenizer.from_pretrained(TTS_CHECKPOINT)


	def translate(audio):
	    """Transcribe ``audio`` and return the translation of the transcript.

	    Args:
	        audio: Input forwarded unchanged to ``asr_pipe``.

	    Returns:
	        The ``translation_text`` entry of the first result produced by
	        ``translation_en_to_rus`` for the recognised text.
	    """
	    # NOTE(review): max_new_tokens / generate_kwargs are generation options;
	    # whether the configured ASR model actually uses them is not visible
	    # here -- confirm before relying on them.
	    recognised = asr_pipe(
	        audio,
	        max_new_tokens=256,
	        generate_kwargs={"task": "translate"},
	    )
	    candidates = translation_en_to_rus(recognised["text"])
	    best = candidates[0]
	    return best["translation_text"]

	def synthesise(text):
	    """Convert text to a speech waveform with the module-level TTS model.

	    Args:
	        text: Text to speak. ``speech_to_speech_translation`` passes the
	            output of ``translate``, which is already translated, so no
	            further translation step is applied here.

	    Returns:
	        The ``waveform`` tensor produced by ``model``, moved to the CPU.
	    """
	    # Tokenise the text directly; the former call to an undefined
	    # ``translator`` raised NameError on every invocation.
	    inputs = tokenizer(text, return_tensors="pt")
	    # Inference only: no gradient tracking needed.
	    with torch.no_grad():
	        speech = model(**inputs).waveform
	    return speech.cpu()

	def speech_to_speech_translation(audio):
	    """Run the full pipeline: translate ``audio``, then synthesise speech.

	    Args:
	        audio: Input forwarded to ``translate``.

	    Returns:
	        A ``(sampling_rate, samples)`` tuple: the fixed rate 16000 and the
	        first entry of the synthesised waveform converted to int16.
	    """
	    waveform = synthesise(translate(audio))
	    # Scale by the largest int16 value (32767) before casting to int16.
	    int16_peak = np.iinfo(np.int16).max
	    pcm = (waveform.numpy() * int16_peak).astype(np.int16)
	    # presumably axis 0 is the batch dimension -- only the first item is returned
	    return 16000, pcm[0]

	# Top-level container that hosts the tabbed UI.
	demo = gr.Blocks()

	# First tab: audio recorded from the microphone.
	mic_translate = gr.Interface(
	    fn=speech_to_speech_translation,
	    inputs=gr.Audio(source="microphone", type="filepath"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	)

	# Second tab: an uploaded audio file, with one bundled example.
	file_translate = gr.Interface(
	    fn=speech_to_speech_translation,
	    inputs=gr.Audio(source="upload", type="filepath"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	    examples=[["./example.wav"]],
	)

	# Place both interfaces inside the container as named tabs.
	with demo:
	    gr.TabbedInterface(
	        interface_list=[mic_translate, file_translate],
	        tab_names=["Microphone", "Audio File"],
	    )

	demo.launch()