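"""Hausa-to-English speech translation demo.

Pipeline: Hausa audio -> Whisper fine-tuned for Hausa (transcription)
-> SMALL-100 (translation) -> English text-to-speech.
"""
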
import numpy as np
import gradio as gr
from pydub import AudioSegment
from transformers import pipeline, M2M100ForConditionalGeneration
from tokenization_small100 import SMALL100Tokenizer  # custom tokenizer shipped with the SMALL-100 repo

# Speech-recognition pipeline fine-tuned for Hausa
pipe = pipeline(
    "automatic-speech-recognition",
    model="DrishtiSharma/whisper-large-v2-hausa",
    tokenizer="DrishtiSharma/whisper-large-v2-hausa",
)

# SMALL-100 multilingual translation model and its tokenizer
model_name = "alirezamsh/small100"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = SMALL100Tokenizer.from_pretrained(model_name)

# Text-to-speech pipeline for the English output
tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")

# Define the function to translate speech
def translate_speech(audio_file):
    print(f"Type of audio: {type(audio_file)}, Value of audio: {audio_file}")  # Debug line

    # Load the audio file with pydub; from_file infers the format, so wav,
    # mp3, and other containers recorded by Gradio all work
    audio = AudioSegment.from_file(audio_file)

    # Downmix to mono and resample to the 16 kHz rate Whisper expects
    audio = audio.set_channels(1).set_frame_rate(16000)

    # Convert the raw samples to float32 in [-1, 1], the format the ASR pipeline expects
    audio_data = np.array(audio.get_array_of_samples()).astype(np.float32)
    audio_data /= float(1 << (8 * audio.sample_width - 1))

    # Use the speech recognition pipeline to transcribe the audio
    output = pipe({"raw": audio_data, "sampling_rate": 16000})
    print(f"Output: {output}")  # Debug line

    # Check if the output contains 'text'
    if "text" not in output:
        print("The output does not contain 'text'")
        return
    transcription = output["text"]

    # Translate the transcription with SMALL-100. The model is steered by the
    # target-language code set on the tokenizer, not by a text prompt, so the
    # transcription is encoded directly
    tokenizer.tgt_lang = "en"
    encoded_text = tokenizer(transcription, return_tensors="pt")
    outputs = model.generate(**encoded_text)

    # Decode the tokens into text
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Use the text-to-speech pipeline to synthesise the translated text
    synthesised_speech = tts(translated_text)

    # Check if the synthesised speech contains 'audio'
    if "audio" not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return
    speech_data = synthesised_speech["audio"].flatten()

    # Scale the audio data to the int16 range Gradio's numpy audio output expects
    speech_int16 = (speech_data * 32767).astype(np.int16)

    # Return the sampling rate reported by the TTS pipeline rather than a hard-coded value
    return synthesised_speech["sampling_rate"], speech_int16
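
# Quick local smoke test that bypasses the UI (the audio path is a placeholder):
#   sample_rate, samples = translate_speech("sample_hausa.mp3")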

# Define the Gradio interface (gr.Audio replaces the deprecated gr.inputs/gr.outputs API)
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()
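# Note: when testing outside Spaces, iface.launch(share=True) exposes a temporary public URL.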