speech-to-speech-translation

Running

App Files Files Community

speech-to-speech-translation / app.py

preetam8

try new finetuned whisper model

e970d56 8 months ago

raw

history blame

3.63 kB

	import gradio as gr
	import librosa
	import logging
	import numpy as np
	import torch

	from transformers import VitsModel, VitsTokenizer, pipeline
	from transformers import WhisperForConditionalGeneration, WhisperProcessor


	# Checkpoint identifiers, each named once so the loaders below cannot drift apart.
	ASR_CHECKPOINT = "bofenghuang/whisper-small-cv11-french"
	TTS_CHECKPOINT = "facebook/mms-tts-fra"

	# Language code passed to the ASR generation step in translate().
	target_language = "fr"

	# Run the ASR pipeline on the first CUDA device when torch can see one, else on CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Speech recognition pipeline used by translate().
	asr_pipe = pipeline(
	    "automatic-speech-recognition",
	    model=ASR_CHECKPOINT,
	    device=device,
	)
	# Disabled: manual Whisper setup used by the commented-out translate() variant.
	# whisper_model_name = "openai/whisper-small"
	# whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
	# whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
	# decoder_ids = whisper_processor.get_decoder_prompt_ids(language=target_language, task="transcribe")

	# Text-to-speech model and its tokenizer, both from the same checkpoint.
	# Note: the TTS model is not moved to `device`; it stays wherever from_pretrained puts it.
	model = VitsModel.from_pretrained(TTS_CHECKPOINT)
	tokenizer = VitsTokenizer.from_pretrained(TTS_CHECKPOINT)


	def translate(audio):
	    """Run the ASR pipeline on ``audio`` and return the recognised text.

	    Args:
	        audio: Input forwarded unchanged to ``asr_pipe``. The Gradio inputs
	            in this file use ``type="filepath"``, so this is presumably a
	            path to an audio file.

	    Returns:
	        The ``"text"`` entry of the pipeline result.
	    """
	    # NOTE(review): the task is "transcribe" with the language forced to
	    # target_language; presumably this relies on the model emitting text in
	    # the forced language regardless of the spoken one - confirm.
	    generation_options = {"task": "transcribe", "language": target_language}
	    result = asr_pipe(
	        audio,
	        max_new_tokens=256,
	        generate_kwargs=generation_options,
	    )
	    return result["text"]

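	# Earlier manual implementation of translate(), kept for reference only.
	# It is disabled and depends on the commented-out whisper_processor,
	# whisper_model and decoder_ids definitions near the top of the file.
	# NOTE(review): recent librosa releases take orig_sr/target_sr as keyword
	# arguments in resample() - confirm before re-enabling.
	# def translate(audio):
	#     if isinstance(audio, str):
	#         # Account for recorded audio
	#         audio = {
	#             "path": audio,
	#             "sampling_rate": 16_000,
	#             "array": librosa.load(audio, sr=16_000)[0]
	#         }
	#     elif audio["sampling_rate"] != 16_000:
	#         audio["array"] = librosa.resample(audio["array"], audio["sampling_rate"], 16_000)
	#     input_features = whisper_processor(audio["array"], sampling_rate=16000, return_tensors="pt").input_features
	#     predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=decoder_ids)
	#     translated_text = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
	#     return translated_text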


	def synthesise(text):
	    """Generate speech for ``text`` with the module-level TTS model.

	    Args:
	        text: String handed to ``tokenizer``.

	    Returns:
	        The first waveform of the model output, moved to the CPU.
	    """
	    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]
	    # Inference only: skip autograd bookkeeping during the forward pass.
	    with torch.no_grad():
	        tts_output = model(token_ids)
	    # Index 0 selects the single item of the batch built from one input text.
	    waveform = tts_output["waveform"][0]
	    logging.info(waveform)
	    return waveform.cpu()


	def speech_to_speech_translation(audio):
	    """Turn input speech into synthesised speech of its recognised text.

	    Chains ``translate`` (speech to text) and ``synthesise`` (text to
	    waveform), then converts the waveform to 16-bit PCM.

	    Args:
	        audio: Input forwarded unchanged to ``translate``.

	    Returns:
	        A ``(sampling_rate, samples)`` tuple where ``samples`` is an
	        ``np.int16`` array, matching the ``type="numpy"`` Gradio audio
	        outputs this function is wired to.
	    """
	    translated_text = translate(audio)
	    logging.info("Translated Text: %s", translated_text)

	    waveform = synthesise(translated_text).numpy()
	    # Assumes float samples nominally in [-1, 1] - TODO confirm. Clipping
	    # first makes any out-of-range sample saturate instead of wrapping
	    # around in the int16 cast.
	    pcm_samples = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

	    # Report the rate declared by the TTS checkpoint rather than a literal,
	    # so playback speed stays correct if the checkpoint is swapped.
	    return model.config.sampling_rate, pcm_samples


	# Heading shown on each tab of the demo.
	title = "Cascaded STST"
	# Markdown shown under the heading. It names the checkpoints this file
	# actually loads: the ASR pipeline model and the VITS text-to-speech model.
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. Demo uses a fine-tuned [Whisper Small](https://huggingface.co/bofenghuang/whisper-small-cv11-french) model for speech translation, and Meta's
	[MMS-TTS French](https://huggingface.co/facebook/mms-tts-fra) VITS model for text-to-speech:

	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	def _build_tab(audio_source, **interface_kwargs):
	    """Create one Interface around speech_to_speech_translation.

	    Args:
	        audio_source: Value for the input ``gr.Audio`` ``source`` argument.
	        **interface_kwargs: Extra keyword arguments passed to ``gr.Interface``.

	    Returns:
	        A new ``gr.Interface`` with its own input and output audio components.
	    """
	    return gr.Interface(
	        fn=speech_to_speech_translation,
	        inputs=gr.Audio(source=audio_source, type="filepath"),
	        outputs=gr.Audio(label="Generated Speech", type="numpy"),
	        title=title,
	        description=description,
	        **interface_kwargs,
	    )


	# Container that hosts the tabbed layout.
	demo = gr.Blocks()

	# One tab records from the microphone; the other takes an uploaded file
	# and offers a bundled example clip.
	mic_translate = _build_tab("microphone")
	file_translate = _build_tab("upload", examples=[["./example.wav"]])

	with demo:
	    gr.TabbedInterface(
	        [mic_translate, file_translate],
	        ["Microphone", "Audio File"],
	    )

	# Raise the root logger to INFO so the logging.info calls above are emitted.
	root_logger = logging.getLogger()
	root_logger.setLevel(logging.INFO)
	demo.launch()