File size: 3,107 Bytes
e0cb286
 
5b74a4b
544b0f8
83e3ccb
c23e905
 
5b74a4b
49f6e13
bc3fe61
 
 
eda1d8a
 
 
 
 
def416c
a927d1d
2f47955
 
 
0639911
75c9b3b
2f47955
bc3fe61
75c9b3b
 
bc3fe61
75c9b3b
 
bc3fe61
75c9b3b
 
 
393002d
eda1d8a
 
cd0ec84
eda1d8a
4804944
dfd48ca
eda1d8a
 
 
 
 
 
 
cd0ec84
75c9b3b
 
25fb027
75c9b3b
 
 
 
 
 
c58bd88
75c9b3b
 
8c23bfa
75c9b3b
 
 
 
65b0d6a
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import torch
import librosa
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor, SeamlessM4Tv2Model, pipeline, AutoTokenizer
import numpy as np
import soundfile as sf
import tempfile


# Checkpoint identifiers, named once so that each model and its matching
# processor are always loaded from the same repository.
_ASR_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
_TRANSLATOR_CHECKPOINT = "facebook/seamless-m4t-v2-large"
_TTS_CHECKPOINT = "Baghdad99/hausa_voice_tts"

# English speech recognition (Wav2Vec2 with a CTC head) and its processor.
asr_model = Wav2Vec2ForCTC.from_pretrained(_ASR_CHECKPOINT)
asr_processor = Wav2Vec2Processor.from_pretrained(_ASR_CHECKPOINT)

# SeamlessM4T v2 translation model and its processor.
translator_model = SeamlessM4Tv2Model.from_pretrained(_TRANSLATOR_CHECKPOINT)
translator_processor = AutoProcessor.from_pretrained(_TRANSLATOR_CHECKPOINT)

# Text-to-speech pipeline used to voice the translated text.
tts = pipeline("text-to-speech", model=_TTS_CHECKPOINT)

def translate_speech(audio_file_path):
    """Translate spoken English audio into synthesised Hausa speech.

    The audio is loaded at 16 kHz, transcribed with the Wav2Vec2 CTC model,
    translated from English ("eng") to Hausa ("hau") with SeamlessM4T v2 in
    text-only mode, and finally voiced by the text-to-speech pipeline.

    Args:
        audio_file_path: Path of the audio file to translate. May be ``None``
            or empty when no audio was supplied.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is a 1-D
        ``np.int16`` array, or ``None`` when there is nothing to synthesise
        (no input, empty transcription or translation, or a TTS result
        without an ``'audio'`` entry).
    """
    # Guard against a submission without audio instead of crashing in librosa.
    if not audio_file_path:
        print("No audio file was provided")
        return None

    # Load the audio file as a floating point time series, resampled to the
    # 16 kHz rate that is also declared to the ASR processor below.
    audio_data, _ = librosa.load(audio_file_path, sr=16000)

    # Prepare the input dictionary for the ASR model
    input_dict = asr_processor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = asr_model(input_dict.input_values.to("cpu")).logits

    # Greedy CTC decoding of the first (only) item in the batch
    pred_ids = torch.argmax(logits, dim=-1)[0]
    transcription = asr_processor.decode(pred_ids)
    print(f"Transcription: {transcription}")  # Print the transcription

    if not transcription.strip():
        print("The transcription is empty; nothing to translate")
        return None

    # Prepare the input dictionary for the translator
    text_inputs = translator_processor(text=transcription, src_lang="eng", return_tensors="pt")

    # generate() produces speech by default; generate_speech=False is needed
    # to obtain text token ids that the processor can decode.
    with torch.no_grad():
        output_tokens = translator_model.generate(
            **text_inputs, tgt_lang="hau", generate_speech=False
        )

    # The first element holds the generated sequences; take the first batch
    # item and let the tokenizer drop every special token (padding,
    # end-of-sequence, language tags) rather than stripping a few by hand.
    translated_text_str = translator_processor.decode(
        output_tokens[0].tolist()[0], skip_special_tokens=True
    ).strip()
    print(f"Translated text string: {translated_text_str}")  # Print the translated text string

    if not translated_text_str:
        print("The translation is empty; nothing to synthesise")
        return None

    # Use the text-to-speech pipeline to synthesize the translated text
    synthesised_speech = tts(translated_text_str)

    # Check if the synthesised speech contains 'audio'
    if 'audio' not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return None

    # Flatten the audio data to a 1-D waveform
    waveform = np.asarray(synthesised_speech['audio'], dtype=np.float32).flatten()

    # Use the rate reported by the pipeline; fall back to the previously
    # hard-coded 16 kHz if the result does not report one.
    sampling_rate = synthesised_speech.get('sampling_rate', 16000)

    # Scale to int16. Assumes the waveform is nominally in [-1, 1]; clipping
    # prevents integer wrap-around on samples that overshoot that range.
    pcm_samples = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    return sampling_rate, pcm_samples

# Define the Gradio interface.
# The `gr.inputs` / `gr.outputs` namespaces were deprecated in Gradio 3.x and
# removed in 4.x; the unified `gr.Audio` component serves as both input and
# output.
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(type="filepath"),  # the handler receives a path on disk
    outputs=gr.Audio(type="numpy"),  # the handler returns (sample_rate, samples)
    title="English to Hausa Translation",
    description="Realtime demo for English to Hausa translation using speech recognition and text-to-speech synthesis."
)

iface.launch()