File size: 1,757 Bytes
0599b65
55b7f86
0599b65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55b7f86
e2052ea
0599b65
e2052ea
 
 
0599b65
5f661ec
e2052ea
 
 
 
 
0599b65
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import gradio as gr
import librosa
import torch
import torchaudio
import numpy as np

from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_path = os.environ.get("HF_REPO_ID")
access_token = os.environ.get("HF_TOKEN")

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path, token=access_token)
tokenizer = WhisperTokenizer.from_pretrained(model_path, token=access_token)
processor = WhisperProcessor.from_pretrained(model_path, token=access_token)
model = WhisperForConditionalGeneration.from_pretrained(model_path, token=access_token).to(device)


def transcribe_audio(file_path):
    speech_array, sampling_rate = torchaudio.load(file_path, format="wav")
    speech_array = speech_array[0].numpy()
    speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16000)
    input_features = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt").input_features

    # batch = processor.feature_extractor.pad(input_features, return_tensors="pt")
    predicted_ids = model.generate(inputs=input_features.to(device))[0]

    transcription = processor.decode(predicted_ids, skip_special_tokens=True)

    return transcription


# Create a list of example audio files
examples = [f"test_sample/{x}" for x in os.listdir("test_sample")]

# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(),
    examples=examples
)

# Launch the interface
interface.launch()