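"""Gradio app for speech translation.

The uploaded audio is transcribed and translated to English by a Whisper ASR
pipeline; for non-English targets, an MBart-50 many-to-many model then
translates the English transcript into the selected language.
"""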
import gradio as gr
from transformers import pipeline
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from utils import lang_ids

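# ASR checkpoint used by the transcription pipeline, and the pipeline batch size.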
MODEL_NAME = "Pranjal12345/pranjal_whisper_medium"
BATCH_SIZE = 8

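# MBart-50 many-to-many model and tokenizer for translating the English transcript.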
model = MBartForConditionalGeneration.from_pretrained("sanjitaa/mbart-many-to-many")
tokenizer = MBart50TokenizerFast.from_pretrained("sanjitaa/mbart-many-to-many")

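# Speech-recognition pipeline; audio is processed in 30-second chunks on CPU.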
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device='cpu',
)

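# Target-language display names; lang_ids maps each name to an MBart language code.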
lang_list = list(lang_ids.keys())

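# Naive sentence splitter so each MBart translation call gets a short input.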
def split_into_sentences(text):
    sentences = text.replace('?', '.').replace('!', '.').split('.')
    return [sentence.strip() for sentence in sentences if sentence.strip()]


def translate_audio(inputs, target_language):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")

    # Transcribe the audio and translate it to English with the Whisper pipeline.
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "translate"}, return_timestamps=True)["text"]

    if target_language == 'English':
        return text

    # Translate the English transcript sentence by sentence with MBart.
    target_lang = lang_ids[target_language]
    tokenizer.src_lang = "en_XX"
    chunks = split_into_sentences(text)
    translated_text = ''

    for segment in chunks:
        encoded_chunk = tokenizer(segment, return_tensors="pt")
        generated_tokens = model.generate(
            **encoded_chunk,
            forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
        )
        translated_chunk = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        translated_text = translated_text + translated_chunk[0] + ' '
    return translated_text.strip()

inputs = [
    gr.Audio(label="Audio file"),
    gr.Dropdown(lang_list, value="English", label="Target Language"),
]
description = "Audio translation"


translation_interface = gr.Interface(
    fn=translate_audio,
    inputs=inputs,
    outputs="text",
    title="Speech Translation",
    description=description,
)

translation_interface.launch()