import gradio as gr
from audio_processing import process_audio, print_results
import torch
import spaces  # Hugging Face Spaces helper (ZeroGPU support)
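
# Assumed contract for audio_processing.process_audio, inferred from its use
# below: it returns (language_segments, final_segments), two lists of dicts.
# Both carry 'language', 'start', and 'end'; final_segments additionally carry
# 'speaker', 'text', and, when translate=True, 'translated'.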


print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("No CUDA GPUs available. Running on CPU.")

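# On a ZeroGPU Space, spaces.GPU requests a GPU for the duration of each call;
# this is the usual companion to the `import spaces` above. Remove the
# decorator if the app runs on dedicated (non-ZeroGPU) hardware.
@spaces.GPU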
def transcribe_audio(audio_file, translate, model_size):
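    """Run language detection, diarization, and transcription on an audio file.

    Args:
        audio_file: Path to the input audio (Gradio passes a filepath string).
        translate: If True, include an English translation for each segment.
        model_size: Whisper model size, e.g. "tiny" through "large-v3".

    Returns:
        A formatted string listing detected language changes followed by the
        per-segment transcript (and translations when requested).
    """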
    language_segments, final_segments = process_audio(audio_file, translate=translate, model_size=model_size)
    
    output = "Detected language changes:\n\n"
    for segment in language_segments:
        output += f"Language: {segment['language']}\n"
        output += f"Time: {segment['start']:.2f}s - {segment['end']:.2f}s\n\n"

    output += f"Transcription with language detection and speaker diarization (using {model_size} model):\n\n"
    for segment in final_segments:
        output += f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['language']}) {segment['speaker']}:\n"
        output += f"Original: {segment['text']}\n"
        if translate:
            output += f"Translated: {segment['translated']}\n"
        output += "\n"
    return output

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Checkbox(label="Enable Translation"),
        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"], label="Whisper Model Size", value="small")
    ],
    outputs="text",
    title="WhisperX Audio Transcription and Translation"
)

if __name__ == "__main__":
    iface.launch(share=True)