File size: 2,161 Bytes
8d7bec1
36bec1c
ec72da9
bbee8bf
 
8d7bec1
 
bc7920f
8d7bec1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbee8bf
8d7bec1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import sentencepiece
import sounddevice as sd
import soundfile as sf
import tempfile


def translate_voice(audio, target_lang):
    """Transcribe spoken audio, translate it, and synthesize the translation.

    Parameters
    ----------
    audio : numpy.ndarray or (int, numpy.ndarray)
        Raw mono samples, or a Gradio-style ``(sample_rate, samples)`` tuple.
    target_lang : str
        ISO 639-1 code of the language to translate into (e.g. ``'en'``).

    Returns
    -------
    tuple
        ``(tts_mp3_path, original_text, translated_text, target_lang)``.
    """
    # Gradio's numpy Audio component delivers (sample_rate, samples);
    # accept both that tuple and a bare sample array (assumed 16 kHz).
    if isinstance(audio, tuple):
        sample_rate, samples = audio
    else:
        sample_rate, samples = 16000, audio

    # delete=False so the file survives the `with`; we remove it ourselves
    # in the `finally` below (the original code leaked it).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        temp_filename = temp_audio.name
    try:
        sf.write(temp_filename, samples, sample_rate)

        # Speech-to-text with Whisper; force fp32 so CPU-only hosts work.
        stt_model = whisper.load_model("base").float()
        audio_data = whisper.load_audio(temp_filename)
        audio_data = whisper.pad_or_trim(audio_data)
        mel = whisper.log_mel_spectrogram(audio_data).to(stt_model.device).float()

        _, probs = stt_model.detect_language(mel)
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(stt_model, mel, options)

        text = result.text
        lang = max(probs, key=probs.get)  # most probable source language

        # SMaLL-100 is a many-to-many translator whose output language is
        # selected via tokenizer.tgt_lang. The original code set src_lang to
        # the *target* language, so the requested output language was never
        # actually applied.
        tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
        translator = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")

        tokenizer.tgt_lang = target_lang
        encoded = tokenizer(text, return_tensors="pt")
        generated_tokens = translator.generate(**encoded)
        translated_text = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )[0]

        # Text-to-speech for the translated text.
        tts = gTTS(text=translated_text, lang=target_lang)
        filename = "to_speech.mp3"
        tts.save(filename)

        return filename, text, translated_text, target_lang
    finally:
        # Clean up the temporary WAV in every exit path.
        os.remove(temp_filename)


def record_audio():
    """Capture five seconds of mono microphone audio at 16 kHz.

    Returns a flat 1-D array of samples suitable for ``translate_voice``.
    """
    sample_rate = 16000
    seconds = 5  # Record audio for 5 seconds, you can adjust the duration as needed
    recording = sd.rec(int(seconds * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()  # block until the capture completes
    return recording.flatten()


# Build the web UI. The gr.inputs / gr.outputs namespaces were removed in
# Gradio 3+, and "microphone" was never a valid Audio *type* (it is a
# source), so the original constructor calls fail on current Gradio.
# NOTE(review): with type="numpy" the microphone input arrives as a
# (sample_rate, samples) tuple — confirm translate_voice accepts it.
iface = gr.Interface(
    fn=translate_voice,
    inputs=[
        gr.Audio(sources=["microphone"], type="numpy", label="Speak"),
        gr.Dropdown(choices=['en', 'ru', 'de', 'fr'], label="Target Language"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="Translated Audio"),
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Target Language"),
    ],
)

# Guard the server launch so importing this module has no side effects.
if __name__ == "__main__":
    iface.launch()