Spaces:
Runtime error
Runtime error
File size: 4,686 Bytes
60648c4 34b86ea 60648c4 39e9f1f 60648c4 39e9f1f 60648c4 34b86ea 60648c4 34b86ea 60648c4 34b86ea 60648c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import gradio as gr
import sox
import numpy as np
import yaml
import tensorflow as tf
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor
import scipy.signal as sps
# Text-to-speech stack.  The text processor and the acoustic model come from
# the same FastSpeech2 checkpoint; the MB-MelGAN checkpoint is used by ``tts``
# to turn the acoustic model's mel outputs into audio.
processor_tts = AutoProcessor.from_pretrained(
    "tensorspeech/tts-fastspeech2-ljspeech-en"
)
fastspeech2 = TFAutoModel.from_pretrained(
    "tensorspeech/tts-fastspeech2-ljspeech-en"
)
mb_melgan = TFAutoModel.from_pretrained(
    "tensorspeech/tts-mb_melgan-ljspeech-en"
)
def tts(text):
    """Synthesize ``text`` and return the path of the resulting WAV file.

    The text is turned into a token-id sequence by ``processor_tts``, fed to
    ``fastspeech2`` to obtain two mel outputs (``mel_before`` and
    ``mel_after``), and each mel output is vocoded by ``mb_melgan``.  Both
    waveforms are written to the working directory at 22050 Hz with the
    ``PCM_16`` subtype; only the path of the "after" file is returned.
    """
    token_ids = processor_tts.text_to_sequence(text)

    # Batch of one utterance, speaker 0, with all ratios left at 1.0.
    batch_ids = tf.expand_dims(
        tf.convert_to_tensor(token_ids, dtype=tf.int32), 0
    )
    speaker = tf.convert_to_tensor([0], dtype=tf.int32)
    speed = tf.convert_to_tensor([1.0], dtype=tf.float32)
    pitch = tf.convert_to_tensor([1.0], dtype=tf.float32)
    energy = tf.convert_to_tensor([1.0], dtype=tf.float32)

    mel_before, mel_after, _durations, _, _ = fastspeech2.inference(
        input_ids=batch_ids,
        speaker_ids=speaker,
        speed_ratios=speed,
        f0_ratios=pitch,
        energy_ratios=energy,
    )

    # Vocode both mel outputs first, then write both files.
    waveforms = [
        mb_melgan.inference(mel)[0, :, 0] for mel in (mel_before, mel_after)
    ]
    wav_paths = ('./audio_before.wav', './audio_after.wav')
    for wav_path, waveform in zip(wav_paths, waveforms):
        sf.write(wav_path, waveform, 22050, "PCM_16")

    return './audio_after.wav'
def convert(inputfile, outfile):
    """Transcode ``inputfile`` with sox and write the result to ``outfile``.

    The output format is a single-channel WAV at 16000 Hz with 16-bit
    signed-integer encoding.
    """
    output_format = {
        "file_type": "wav",
        "channels": 1,
        "encoding": "signed-integer",
        "rate": 16000,
        "bits": 16,
    }
    transformer = sox.Transformer()
    transformer.set_output_format(**output_format)
    transformer.build(inputfile, outfile)
# Translation setup: M2M100 (418M checkpoint) with the tokenizer's source
# language set to ``inlang``; ``outlang`` is the target used by ``translate``.
inlang = 'hi'
outlang = 'en'
model_translate = M2M100ForConditionalGeneration.from_pretrained(
    "facebook/m2m100_418M"
)
tokenizer_translate = M2M100Tokenizer.from_pretrained(
    "facebook/m2m100_418M"
)
tokenizer_translate.src_lang = inlang
def translate(text):
    """Translate ``text`` into ``outlang`` and return the decoded string.

    The tokenizer's configured source language is used for encoding; the
    generation is forced to start with the language id of ``outlang``.  Only
    the first decoded sequence is returned, with special tokens stripped.
    """
    model_inputs = tokenizer_translate(text, return_tensors="pt")
    target_lang_id = tokenizer_translate.get_lang_id(outlang)
    output_ids = model_translate.generate(
        **model_inputs,
        forced_bos_token_id=target_lang_id,
    )
    decoded = tokenizer_translate.batch_decode(
        output_ids, skip_special_tokens=True
    )
    return decoded[0]
# Speech-recognition setup: the processor and the CTC model are loaded from
# the same Wav2Vec2 checkpoint.
processor = Wav2Vec2Processor.from_pretrained(
    "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
)
model = Wav2Vec2ForCTC.from_pretrained(
    "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200"
)
def read_file(wav, target_rate=16000):
    """Convert a ``(sample_rate, samples)`` recording to mono at ``target_rate``.

    Args:
        wav: Pair of the source sampling rate in Hz and an array-like of
            samples.  The samples may be 1-D (mono) or multi-dimensional
            with the channels on the last axis.
        target_rate: Sampling rate in Hz of the returned signal.

    Returns:
        A 1-D ``float64`` NumPy array holding the mono signal resampled to
        ``target_rate``.
    """
    sample_rate, signal = wav
    signal = np.asarray(signal, dtype=np.float64)

    # Down-mix only when a channel axis exists.  Averaging a 1-D mono
    # recording over its last axis would collapse it to a scalar and make
    # the ``len()`` call below raise ``TypeError``.
    if signal.ndim > 1:
        signal = signal.mean(-1)

    # Nothing to resample when the recording is already at the target rate.
    if sample_rate == target_rate:
        return signal

    number_of_samples = round(len(signal) * float(target_rate) / sample_rate)
    return sps.resample(signal, number_of_samples)
def parse_transcription(wav_file):
    """Run the recognise -> translate -> synthesize pipeline on one recording.

    Args:
        wav_file: ``(sample_rate, samples)`` pair; it is passed unchanged to
            ``read_file``, which yields the signal fed to the recogniser.

    Returns:
        A 3-tuple of the decoded transcription, the result of ``translate``
        on that transcription, and the value returned by ``tts`` for the
        translation.
    """
    speech = read_file(wav_file)
    input_values = processor(
        speech, sampling_rate=16_000, return_tensors="pt"
    ).input_values

    # Inference only: disable autograd so no computation graph is built and
    # kept alive for the forward pass.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring token per frame and let the processor decode
    # the first (only) sequence in the batch.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)

    translation = translate(transcription)
    return transcription, translation, tts(translation)
# UI components: two text boxes and one audio player for the three values
# returned by ``parse_transcription``, plus a microphone input delivered as
# NumPy data.
output1 = gr.outputs.Textbox(label="Hindi Output from ASR")
output2 = gr.outputs.Textbox(label="English Translated Output")
input_ = gr.inputs.Audio(source="microphone", type="numpy")
output_audio = gr.outputs.Audio(type="file", label="Output Audio")

# Build the interface and start serving it.
gr.Interface(
    fn=parse_transcription,
    inputs=input_,
    outputs=[output1, output2, output_audio],
    title="Vakyansh: Speech To text for Indic Languages",
    description="This is a live demo for Speech to Speech Translation. Speak in Hindi and get output in English",
    theme='huggingface',
    layout='vertical',
    show_tips=False,
    analytics_enabled=False,
    enable_queue=True,
).launch(inline=False)
|