File size: 1,649 Bytes
c60e096
7752cd2
 
c60e096
7752cd2
 
 
c60e096
7752cd2
 
 
 
 
 
c60e096
7752cd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c60e096
7752cd2
c60e096
6cb9375
c60e096
7752cd2
 
c60e096
 
 
 
7752cd2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import gradio as gr
from faster_whisper import WhisperModel
import logging

# Configure logging so faster_whisper's debug output is emitted
logging.basicConfig()
logging.getLogger("faster_whisper").setLevel(logging.DEBUG)

# Initialize the Whisper model with your desired configuration
model_size = "large-v3"  # Choose the model size
device = "cpu"  # or "cuda" if GPU is available
# float16 is rejected by the CPU backend (CTranslate2 raises ValueError), so
# derive the compute type from the selected device: float16 on GPU, int8 on CPU.
compute_type = "float16" if device == "cuda" else "int8"

# WhisperModel's first parameter is `model_size_or_path`; there is no
# `model_size` keyword, so the size is passed positionally.
model = WhisperModel(model_size, device=device, compute_type=compute_type)

def transcribe(audio_file):
    """Transcribe an audio file and format the result with timestamps.

    Args:
        audio_file: Path of the audio file to transcribe. May be ``None`` or
            empty when the caller has nothing to transcribe (e.g. the form
            was submitted without an upload).

    Returns:
        A newline-separated string: one ``[start - end] text`` line per
        segment, each followed by indented ``[start - end] word`` lines for
        the words of that segment. When no audio file is supplied, a short
        notice is returned instead.
    """
    # Bail out early rather than letting the model fail on a missing path.
    if not audio_file:
        return "No audio file provided."

    # Enable word-level timestamps
    segments, _ = model.transcribe(audio_file, word_timestamps=True)

    lines = []
    for segment in segments:
        lines.append(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")
        # Guard against a segment without word details (None or empty) so it
        # contributes only its header line and no stray blank line.
        lines.extend(
            f"    [{word.start:.2f}s - {word.end:.2f}s] {word.word}"
            for word in segment.words or ()
        )

    return "\n".join(lines)

# Build the Gradio UI: one audio-upload input (delivered to transcribe() as a
# file path) and a plain-text output for the formatted transcription.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources="upload",
        type="filepath",
        label="Upload Audio",
    ),
    outputs="text",
    title="Enhanced Whisper Transcription with Timestamps",
    description="Upload an audio file to get detailed transcription with timestamps using Faster Whisper.",
)

# Start the web app only when this file is executed as a script
if __name__ == "__main__":
    iface.launch()