import gradio as gr
import os
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

# Load the Whisper processor and model once at startup (weights are
# downloaded from the Hugging Face Hub on first run)
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
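
# Optional sketch (not enabled here): on a machine with a CUDA GPU, inference
# can be moved to the device for speed; torch is assumed to be installed,
# since transformers needs it to load this model:
#   import torch
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)
# The input_features tensor built below would then need a matching .to(device).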

# Light green (emerald) theme for the interface
theme = gr.themes.Base(
    primary_hue="emerald",
    secondary_hue="emerald",
    neutral_hue="gray",
)

def validate_file(file_path):
    """Return (is_valid, message) for an uploaded audio file path."""
    # Check that the file actually exists on disk
    if not file_path or not os.path.exists(file_path):
        return False, "No file uploaded or file not found."
    
    # Check file size (25 MB limit)
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    if file_size_mb > 25:
        return False, f"File size is {file_size_mb:.2f} MB. Please upload a file smaller than 25 MB."
    
    # Check file extension
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension not in ['.mp3', '.wav']:
        return False, "Only .mp3 and .wav formats are supported."
    
    return True, "File is valid."
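
# Example of the (is_valid, message) contract, using hypothetical paths that
# are assumed to exist on disk (and talk.wav to be under 25 MB):
#   validate_file("talk.wav")  -> (True, "File is valid.")
#   validate_file("talk.flac") -> (False, "Only .mp3 and .wav formats are supported.")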

def transcribe_audio(audio_file):
    """Transcribe an uploaded .mp3/.wav file with Whisper and return the text."""
    # Gradio passes None when no file has been uploaded
    if audio_file is None:
        return "Please upload an audio file."
    
    # Validate the file first
    is_valid, message = validate_file(audio_file)
    if not is_valid:
        return message
    
    try:
        # Load the audio and resample to the 16 kHz rate Whisper expects
        speech_array, sampling_rate = librosa.load(audio_file, sr=16000)
        
        # Convert the waveform to log-mel input features; the feature
        # extractor pads/truncates to one 30-second window, so only the
        # first ~30 seconds of a longer file are transcribed
        input_features = processor(speech_array, sampling_rate=16000, return_tensors="pt").input_features
        
        # Generate token ids
        predicted_ids = model.generate(input_features)
        
        # Decode token ids to text
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        
        return transcription
        
    except Exception as e:
        return f"An error occurred during transcription: {str(e)}"

# Create Gradio interface
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# Audio Transcription with Whisper")
    gr.Markdown("Upload an audio file (.mp3 or .wav) of maximum 25MB to get the transcription.")
    
    with gr.Row():
        with gr.Column():
            # type="filepath" makes Gradio pass the uploaded file's path on
            # disk, which is what validate_file and librosa.load expect
            audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
            submit_btn = gr.Button("Transcribe", variant="primary")
        
        with gr.Column():
            output = gr.Textbox(label="Transcription Result", lines=10)
    
    submit_btn.click(fn=transcribe_audio, inputs=audio_input, outputs=output)
    
    gr.Markdown("### Limitations")
    gr.Markdown("- Maximum file size: 25 MB")
    gr.Markdown("- Supported formats: .mp3 and .wav")
    gr.Markdown("- Uses the Whisper base model which works best with clear audio")

# Launch the app
if __name__ == "__main__":
    demo.launch()
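
# Assumed environment (the original file does not pin dependencies):
#   pip install gradio transformers torch librosa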