# -*- coding: utf-8 -*-
"""
Created on Mon Dec  9 16:43:31 2024

@author: Pradeep Kumar
"""
import whisper
import torch
import os
import gradio as gr
from deep_translator import GoogleTranslator

# Check if NVIDIA GPU is available
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Directories for transcripts
BASE_DIR = os.getcwd()
TRANSCRIPTS_FOLDER = os.path.join(BASE_DIR, 'transcripts')

# Ensure transcripts directory exists
def check_directory(path):
    # Create the directory if it does not already exist
    os.makedirs(path, exist_ok=True)

check_directory(TRANSCRIPTS_FOLDER)

def transcribe_and_translate(audio_file, selected_language, model_type="base"):
    """
    Transcribe audio using Whisper and translate it into English if required.
    
    :param audio_file: Filesystem path to the uploaded audio file (Gradio
        passes a path string when the Audio component uses type="filepath")
    :param selected_language: Language code for transcription ('nl' or 'en')
    :param model_type: Whisper model size (default is 'base')
    :return: English translation when one is produced, otherwise the raw transcription
    """
    if not audio_file:
        return "An audio file is required."
    if not selected_language:
        return "Language selection is required."

    try:
        # Load the Whisper model based on user selection
        model = whisper.load_model(model_type, device=DEVICE)
    except Exception as e:
        return f"Failed to load Whisper model ({model_type}): {e}"

    try:
        # Transcribe with the user-selected language
        result = model.transcribe(audio_file, language=selected_language, verbose=False)

        # Save the transcription (and translation) with timestamps
        base_name = os.path.basename(audio_file)
        transcript_file = os.path.join(TRANSCRIPTS_FOLDER, f"{base_name}_transcript.txt")
        
        translated_text = []
        with open(transcript_file, 'w', encoding='utf-8') as text_file:
            for segment in result['segments']:
                start_time = segment['start']
                end_time = segment['end']
                text = segment['text']
                text_file.write(f"[{start_time:.2f} - {end_time:.2f}] {text}\n")
                # The dropdown supplies language codes ('nl', 'en'), so compare
                # against those; only non-English audio needs translation
                if selected_language != 'en':
                    text_en = GoogleTranslator(source='auto', target='en').translate(text)
                    translated_text.append(f"[{start_time:.2f} - {end_time:.2f}] {text_en}")
                    text_file.write(f"[{start_time:.2f} - {end_time:.2f}] {text_en}\n")

        # Return the translation when one was produced, otherwise the raw transcription
        return "\n".join(translated_text) if translated_text else result['text']

    except Exception as e:
        return f"Failed to process the audio file: {e}"


# Define the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Dropdown(label="Select Language", choices=["nl", "en"], value="nl"),
        gr.Dropdown(label="Select Model Type", choices=["tiny", "base", "small", "medium", "large"], value="base")
    ],
    outputs="text",
    title="Transcription and Translation"
)
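
# A minimal sketch of exercising the pipeline without the UI, assuming a local
# audio file (the name 'sample_nl.wav' is hypothetical; substitute your own
# path and language code):
#
#     print(transcribe_and_translate("sample_nl.wav", "nl", model_type="tiny"))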

if __name__ == '__main__':
    # Launch the Gradio interface
    interface.launch()
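    # Gradio also supports interface.launch(share=True), which exposes a
    # temporary public URL; handy when the app runs on a remote machine.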