import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
import gradio as gr
from pydub import AudioSegment
import os
import tempfile

# Load Whisper model for transcription
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
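# Inference-only: disable dropout and gradient tracking.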
whisper_model.eval()
torch.set_grad_enabled(False)

# Load MarianMT model for translation (English → Spanish)
translation_model_name = "Helsinki-NLP/opus-mt-en-es"
translator = MarianMTModel.from_pretrained(translation_model_name)
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)

def translate_text(text):
    # MarianMT truncates inputs past its max length, so translate sentence by sentence (naive split).
    sentences = [s.strip() for s in text.split(". ") if s.strip()]
    if not sentences:
        return ""
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    translated = translator.generate(**inputs)
    return " ".join(tokenizer.batch_decode(translated, skip_special_tokens=True))

def transcribe_and_translate(filepath):
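    """Transcribe English speech in 30-second chunks, then translate the transcript to Spanish."""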
    if filepath is None or not os.path.exists(filepath):
        return "No audio file provided."

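    # Decode any container format via pydub/ffmpeg and collapse to mono.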
    audio = AudioSegment.from_file(filepath).set_channels(1)
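    # Whisper handles at most 30 s of audio per pass, so split longer recordings.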
    chunk_length_ms = 30 * 1000
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

    full_transcript = ""

    for i, chunk in enumerate(chunks):
        # Write each chunk to a unique temp file so concurrent requests don't collide.
        fd, chunk_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        chunk.export(chunk_path, format="wav")
        waveform, sr = torchaudio.load(chunk_path)
        os.remove(chunk_path)

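        # Whisper expects 16 kHz input; resample if needed.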
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)

        waveform = waveform.mean(dim=0)  # [channels, samples] -> 1-D mono waveform
        inputs = whisper_processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
        # Pin language/task so Whisper transcribes English instead of auto-detecting.
        predicted_ids = whisper_model.generate(inputs["input_features"], language="en", task="transcribe")
        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        full_transcript += transcription.strip() + " "

    translated_text = translate_text(full_transcript.strip())
    return translated_text

mic_ui = gr.Interface(
    fn=transcribe_and_translate,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(label="Translation (English Speech → Spanish Text)"),
)

file_ui = gr.Interface(
    fn=transcribe_and_translate,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(label="Translation (English Speech → Spanish Text)"),
)

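# Combine both input modes into a single tabbed app.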
app = gr.TabbedInterface([mic_ui, file_ui], ["Microphone Input", "Upload File"])
app.launch()