import torch
import os
import gradio as gr
from deep_translator import GoogleTranslator
import whisper

# Check if NVIDIA GPU is available
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Directory for transcripts
BASE_DIR = os.getcwd()
TRANSCRIPTS_FOLDER = os.path.join(BASE_DIR, 'transcripts')

# Ensure transcripts directory exists
def check_directory(path):
    # exist_ok avoids a race between the existence check and creation
    os.makedirs(path, exist_ok=True)

check_directory(TRANSCRIPTS_FOLDER)

def live_transcribe_and_translate(audio_path, selected_language, model_type="base"):
    """
    Transcribe recorded or uploaded audio using Whisper and translate it into
    English if required.

    :param audio_path: Path to the audio file to transcribe
    :param selected_language: Language code for transcription
    :param model_type: Whisper model type (default is 'base')
    :return: Translated transcript, or a completion message
    """
    try:
        # Load the Whisper model based on user selection
        model = whisper.load_model(model_type, device=DEVICE)
    except Exception as e:
        return f"Failed to load Whisper model ({model_type}): {e}"

    # Whisper decodes the file itself (via ffmpeg) and returns timestamped
    # segments; fp16 inference is only supported on GPU.
    result = model.transcribe(audio_path, language=selected_language, fp16=(DEVICE == "cuda"))

    translated_text = []
    transcript_file = os.path.join(TRANSCRIPTS_FOLDER, 'live_transcript.txt')
    with open(transcript_file, 'w', encoding='utf-8') as text_file:
        for segment in result['segments']:
            start_time = segment['start']
            end_time = segment['end']
            text = segment['text'].strip()
            text_file.write(f"[{start_time:.2f} - {end_time:.2f}] {text}\n")
            if selected_language == 'nl':
                text_en = GoogleTranslator(source='auto', target='en').translate(text)
                translated_text.append(f"[{start_time:.2f} - {end_time:.2f}] {text_en}")
                text_file.write(f"[{start_time:.2f} - {end_time:.2f}] {text_en}\n")

    return "\n".join(translated_text) if translated_text else "Transcription completed."
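
# Alternative sketch (assumption: not part of the original app): if the Gradio
# input were gr.Audio(type="numpy") instead of a file path, the callback would
# receive a (sample_rate, np.ndarray) tuple. Whisper expects mono float32 PCM
# at 16 kHz, so the array must be normalized and resampled first. np.interp is
# a crude linear resampler; a production app would use librosa or torchaudio.
import numpy as np

def gradio_audio_to_whisper(gradio_audio):
    sample_rate, data = gradio_audio
    # Collapse stereo to mono
    if data.ndim > 1:
        data = data.mean(axis=1)
    # Normalize integer PCM to float32 in [-1.0, 1.0]
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)
    # Linearly resample to Whisper's expected rate (whisper.audio.SAMPLE_RATE == 16000)
    target_rate = whisper.audio.SAMPLE_RATE
    if sample_rate != target_rate:
        duration = data.shape[0] / sample_rate
        target_length = int(duration * target_rate)
        data = np.interp(
            np.linspace(0.0, duration, target_length, endpoint=False),
            np.linspace(0.0, duration, data.shape[0], endpoint=False),
            data,
        )
    return data.astype(np.float32)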

# Define the Gradio interface
interface = gr.Interface(
    fn=live_transcribe_and_translate,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),  # Whisper accepts the saved file path directly
        gr.Dropdown(label="Select Language", choices=["nl", "en"], value="en"),
        gr.Dropdown(label="Select Model Type", choices=["tiny", "base", "small", "medium", "large"], value="base")
    ],
    outputs="text",
    title="Live Transcription and Translation"
)


if __name__ == '__main__':
    # Launch the Gradio interface
    interface.launch()
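
# Usage sketch (assumption: "dutch_sample.wav" is a hypothetical local file;
# adjust the path before running). The function can also be called directly,
# bypassing the web UI:
#
#     print(live_transcribe_and_translate("dutch_sample.wav", "nl", model_type="tiny"))
#
# To expose the Gradio app over a temporary public URL, use
# interface.launch(share=True) instead of the plain launch() above.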