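"""Gradio demo: transcribe Indonesian speech with a fine-tuned Whisper model,
then translate each transcribed chunk to English with an opus-mt model."""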
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import gradio as gr
import librosa

MODEL_NAME = "EwoutLagendijk/whisper-small-indonesian"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and processor once at startup; move the model to the GPU if available
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME).to(device)
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Update the generation config for transcription
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="id", task="transcribe")

# Initialize the translation pipeline (using a model like `Helsinki-NLP/opus-mt-id-en` for Indonesian to English)
translation_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-id-en")
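# Note: the translation pipeline runs on CPU by default; passing device=device
# (or device=0) to pipeline() would move it to the GPU as well.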

def transcribe_speech(filepath):
    # Load the audio
    audio, sampling_rate = librosa.load(filepath, sr=16000)

    # Split the audio into fixed-length chunks
    chunk_duration = 5  # seconds per chunk
    chunk_samples = chunk_duration * sampling_rate  # 5 s * 16,000 Hz = 80,000 samples

    # Process audio in chunks
    transcription = []
    for i in range(0, len(audio), chunk_samples):
        chunk = audio[i:i + chunk_samples]

        # Convert the chunk into log-mel input features and move them to the model's device
        inputs = processor(audio=chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)

        # Generate transcription for the chunk
        generated_ids = model.generate(
            inputs,
            max_new_tokens=444,  # Whisper's decoder is capped at 448 tokens; leave room for the forced prompt tokens
            forced_decoder_ids=processor.get_decoder_prompt_ids(language="id", task="transcribe")
        )

        # Decode and append the transcription
        chunk_transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Translate the chunk's transcription to English
        chunk_translation = translation_pipeline(chunk_transcription)[0]['translation_text']

        # Append the transcription and translation for this chunk
        transcription.append(
            f"Chunk {i // chunk_samples + 1}:\n"
            f"Transcription: {chunk_transcription}\n"
            f"Translation: {chunk_translation}\n"
        )

    # Combine all chunk results into a single string
    return "\n".join(transcription)
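# Quick local sanity check (hypothetical audio path, not part of the demo):
#   print(transcribe_speech("sample_indonesian.wav"))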

demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(lines=10, label="Microphone output"),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(lines=10, label="File output"),
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe and translate Microphone", "Transcribe and translate Audio File"],
    )

# debug=True keeps the server in the foreground and prints errors to the console
demo.launch(debug=True)