Spaces:

Lenylvt
/

Whisper-API

Sleeping

File size: 1,647 Bytes

c60e096
7752cd2
 
c60e096
7752cd2
 
 
c60e096
7752cd2
dc2f23e
af1960a
 
7752cd2
af1960a
c60e096
dc2f23e
 
 
 
 
 
 
7752cd2
dc2f23e
3a81031
7752cd2
dc2f23e
3a81031
dc2f23e
 
3a81031
7752cd2
 
c60e096
7752cd2
c60e096
dc2f23e
c60e096
dc2f23e
 
c60e096
 
 
 
7752cd2

import gradio as gr
from faster_whisper import WhisperModel
import logging

# Install the default logging handler and turn on verbose output from the
# faster_whisper logger so model loading / decoding can be inspected.
logging.basicConfig()
_whisper_logger = logging.getLogger("faster_whisper")
_whisper_logger.setLevel(logging.DEBUG)

# Settings handed to WhisperModel below.
model_size = "large-v3"  # Which Whisper model to load
device = "cpu"  # "cuda" for GPU, "cpu" for CPU
compute_type = "int8"  # GPU: "float16" or "int8" - CPU: "int8"

# Constructed once at import time; transcribe() uses this shared instance.
model = WhisperModel(
    model_size,
    device=device,
    compute_type=compute_type,
)

def format_timestamp(seconds):
    """Convert a number of seconds to an ``HH:MM:SS.mmm`` string.

    The value is rounded to the nearest millisecond *before* it is split into
    hours, minutes and seconds, so a rounding carry propagates correctly
    (59.9996 becomes ``00:01:00.000`` rather than ``00:00:60.000``).

    Args:
        seconds: Time offset in seconds (int or float), expected to be
            non-negative.

    Returns:
        The offset formatted as zero-padded ``HH:MM:SS.mmm``. The hours field
        grows beyond two digits when needed.
    """
    # Work in integer milliseconds to avoid float formatting producing "60.000".
    total_ms = int(round(float(seconds) * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

def transcribe(audio_file):
    """Transcribe an audio input into newline-separated, timestamped segments.

    Args:
        audio_file: The audio input supplied by the interface. It is passed
            unchanged to ``model.transcribe``. ``None`` (no audio supplied)
            is accepted.

    Returns:
        One line per transcribed segment in the form
        ``[HH:MM:SS.mmm -> HH:MM:SS.mmm] text``, joined with newlines.
        An empty string when ``audio_file`` is ``None``.
    """
    # Guard against a missing upload instead of handing None to the model.
    if audio_file is None:
        return ""

    # The second return value (transcription info) is not used here.
    segments, _ = model.transcribe(audio_file)

    # Build each "[start -> end] text" line while iterating the segments.
    return "\n".join(
        f"[{format_timestamp(segment.start)} -> {format_timestamp(segment.end)}] {segment.text}"
        for segment in segments
    )

# Gradio UI: a single audio-upload input wired to transcribe(), with the
# resulting transcript shown as plain text.
# NOTE(review): `gr.inputs.Audio(source=..., type="file")` looks like the
# legacy Gradio input API — confirm it is still supported by the Gradio
# version this app is pinned to before upgrading.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.inputs.Audio(source="upload", type="file", label="Upload Audio"),
    outputs="text",
    title="Whisper Transcription with Enhanced Timestamps",
    description="Upload an audio file to get transcription with enhanced timestamps in HH:MM:SS.mmm format using Faster Whisper.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()