File size: 7,520 Bytes
696b78e
e76c7d1
 
 
a6bff07
 
 
 
e76c7d1
a6bff07
 
 
 
 
 
e76c7d1
 
 
 
a6bff07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f95d23
 
 
a6bff07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e76c7d1
 
a6bff07
 
e76c7d1
 
a6bff07
e76c7d1
 
 
a6bff07
 
e76c7d1
 
 
 
a6bff07
 
 
e76c7d1
 
 
 
 
 
 
 
 
a6bff07
 
8813f41
a6bff07
 
 
 
 
 
 
 
 
e76c7d1
 
 
a6bff07
 
 
 
e76c7d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696b78e
e76c7d1
 
696b78e
e76c7d1
1f87b59
e76c7d1
 
1f87b59
e76c7d1
 
 
696b78e
e76c7d1
 
 
3f95d23
e76c7d1
a6bff07
 
1f87b59
a6bff07
1f87b59
 
a6bff07
1f87b59
696b78e
e76c7d1
a6bff07
e76c7d1
 
696b78e
 
e76c7d1
 
 
 
 
 
 
 
 
 
 
 
a6bff07
e76c7d1
 
 
 
1f87b59
e76c7d1
a6bff07
696b78e
 
e76c7d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import gradio as gr
import time
import os
import zipfile
import torch
import librosa
import soundfile as sf
from transformers import pipeline
from typing import List, Tuple, Generator
import datetime
from pydub import AudioSegment

# Initial model name
MODEL_NAME = "primeline/whisper-tiny-german-1224"
# Load a default ASR pipeline at import time so transcription works immediately;
# it is re-assigned per request inside process_files_with_live_updates.
# NOTE(review): the UI dropdown defaults to whisper-large-v3-german, not this
# model — confirm which startup default is intended.
speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)

# Initial status message
STANDARD_OUTPUT_TEXT = "**Status:**<br>"

def get_file_creation_date(file_path: str) -> str:
    """
    Return the creation timestamp of a file as a human-readable string.

    Note: ``st_ctime`` is the true creation time only on Windows; on
    Unix-like systems it is the time of the last metadata change.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: Timestamp formatted as ``YYYY-MM-DD HH:MM:SS``, or an error
        message when the file cannot be stat'ed.
    """
    try:
        # Only the stat call can raise; keep the try body minimal.
        file_stats = os.stat(file_path)
    except OSError:
        # FileNotFoundError is a subclass of OSError; catching the base class
        # also covers permission errors and other unreadable paths instead of
        # letting them crash the transcription loop.
        return "File not found."

    creation_time = datetime.datetime.fromtimestamp(file_stats.st_ctime)
    return creation_time.strftime("%Y-%m-%d %H:%M:%S")

def load_model(model_name: str):
    """
    Build an automatic-speech-recognition pipeline for the given model.

    Args:
        model_name (str): Hugging Face model identifier to load.

    Returns:
        pipeline: A ready-to-use ASR pipeline for that model.
    """
    asr_pipeline = pipeline("automatic-speech-recognition", model=model_name)
    return asr_pipeline

def convert_to_wav(file_path: str) -> str:
    """
    Convert m4a/aac audio files to WAV format if necessary.

    Files that are already in a librosa-readable format (.wav, .mp3) are
    returned unchanged.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the converted WAV file, or the original path when no
        conversion was needed.
    """
    # Case-insensitive tuple check: the original compared lowercase-only
    # extensions one by one, so ".M4A"/".AAC" uploads slipped through
    # unconverted.
    if file_path.lower().endswith((".m4a", ".aac")):
        # pydub (via ffmpeg) handles container formats librosa cannot
        # read directly.
        audio = AudioSegment.from_file(file_path)
        wav_path = os.path.splitext(file_path)[0] + ".wav"
        audio.export(wav_path, format="wav")
        return wav_path
    return file_path

def preprocess_audio(file_path: str) -> str:
    """
    Preprocess an audio file so it is compatible with the ASR model.

    Converts m4a/aac input to WAV first, then resamples to 16 kHz (librosa
    loads mono by default) and writes the result next to the input file.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the preprocessed (resampled) audio file.
    """
    file_path = convert_to_wav(file_path)  # Convert container formats to WAV if necessary
    y, sr = librosa.load(file_path, sr=16000)  # Resample to the 16 kHz the model expects
    # BUG FIX: the original chained str.replace calls, which turned
    # "x.mp3" into "x_processed_processed.wav" (the first replace produced
    # ".wav", which the second replace then matched again) and also replaced
    # matching substrings anywhere in the path. splitext rewrites only the
    # final extension.
    processed_path = os.path.splitext(file_path)[0] + "_processed.wav"
    sf.write(processed_path, y, sr)  # Save the resampled audio
    return processed_path

def process_files_with_live_updates(
    files: List[gr.File],
    model_option: str,
    output_format: str
) -> Generator[Tuple[str, List[str]], None, None]:
    """
    Processes a list of uploaded files, transcribes audio, and provides live updates.

    Args:
        files (List[gr.File]): List of files uploaded by the user.
        model_option (str): Selected model option.
        output_format (str): Selected output format option (currently informational only).

    Yields:
        Tuple[str, List[str]]: Updated status message and list of processed file paths.
    """
    global speech_to_text
    speech_to_text = load_model(model_option)

    total_files = len(files) if files else 0
    if total_files == 0:
        # Guard: without this, the progress computation below divides by zero
        # and the final yield references an unbound loop variable.
        yield (STANDARD_OUTPUT_TEXT + "No files uploaded.<br>", [])
        return

    file_details: List[str] = []
    output_files: List[str] = []

    # Create a folder to temporarily store output files
    output_dir = "output_files"
    os.makedirs(output_dir, exist_ok=True)

    for idx, file in enumerate(files):
        # Preprocess audio file (format conversion + 16 kHz resample)
        preprocessed_path = preprocess_audio(file.name)

        # return_timestamps=True lets Whisper process audio longer than 30 s
        transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
        transcription = transcription_result["text"]

        # Save transcription to file; splitext/basename instead of manual
        # '/'-splitting so Windows paths and dotted file names work too.
        base_name = os.path.splitext(os.path.basename(file.name))[0]
        txt_filename = os.path.join(output_dir, f"transcription_{base_name}.txt")
        with open(txt_filename, "w", encoding="utf-8") as txt_file:
            txt_file.write(transcription)
        output_files.append(txt_filename)

        # Add to file details.
        # BUG FIX: the original passed the gr.File object itself to
        # get_file_creation_date, but os.stat needs a path string (file.name).
        detail = (
            f"**File Name**: {os.path.basename(file.name)}<br>"
            f"**File Date**: {get_file_creation_date(file.name)}<br>"
            f"**Options**: {model_option} - {output_format}<br>"
            f"**Transcription**: {transcription}<br><br>"
        )
        file_details.append(detail)

        # Update progress and yield the updated Markdown
        yield (
            f"**Status: {int(((idx + 1) / total_files) * 100)}%**<br>" + "".join(file_details),
            output_files,
        )

    # Bundle all transcriptions into a single zip for convenient download
    zip_filename = os.path.join(output_dir, "output_files.zip")
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file_path in output_files:
            zipf.write(file_path, os.path.basename(file_path))
    output_files.append(zip_filename)

    # Final yield: the loop completed, so progress is exactly 100%
    yield (
        "**Status: 100%**<br>" + "".join(file_details),
        output_files,
    )

# Gradio app layout — top-level script that builds the UI and launches it.
with gr.Blocks() as demo:

    # Title and Description
    gr.Markdown("# Speech-to-Text Batch Processor (German)")
    gr.Markdown(
        """
        Upload multiple audio files (.wav, .mp3, .m4a, .aac), select desired processing options (i.e. the model), and view real-time updates as files are transcribed.
        The application uses advanced AI models for sequential speech-to-text translation.
        """
    )

    # Input section: file upload on the left, model/format options on the right
    with gr.Row():
        with gr.Column():
            file_input = gr.Files(file_types=[".wav", ".mp3", ".m4a", ".aac"], label="Upload your audio files")
        with gr.Column():
            model_dropdown = gr.Dropdown(
                choices=[
                    "primeline/whisper-large-v3-german",
                    "primeline/whisper-tiny-german-1224", 
                    "primeline/whisper-tiny-german" 
                    ],
                label="Select Model",
                value="primeline/whisper-large-v3-german",
            )
            # Single-choice placeholder; the value is only echoed in the
            # status output, not used to change the output format.
            dropdown_2 = gr.Dropdown(
                choices=["Format: Plain Text"],
                label="Select Output Format",
                value="Format: Plain Text",
            )

    # Buttons
    with gr.Row():
        submit_button = gr.Button("Start Transcription")
        clear_button = gr.Button("Clear")

    # Output section: live Markdown progress plus downloadable result files
    output_md = gr.Markdown(label="Transcription Progress", value=STANDARD_OUTPUT_TEXT)
    output_files = gr.Files(label="Generated Output Files")

    # Button actions — the handler is a generator, so each yield streams a
    # progress update into output_md/output_files.
    submit_button.click(
        process_files_with_live_updates,
        inputs=[file_input, model_dropdown, dropdown_2],
        outputs=[output_md, output_files],
    )

    # Reset all inputs/outputs back to their initial defaults
    clear_button.click(
        lambda: (None, "primeline/whisper-large-v3-german", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
        inputs=[],  # No inputs
        outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
    )

    # NOTE(review): assumes Fraunhofer-IPA-Logo.jpg exists next to this
    # script — confirm the asset is deployed with the app.
    gr.Image("Fraunhofer-IPA-Logo.jpg", show_label=False)

    # Centered Footer with Logo and Licensing Text
    with gr.Row():
        gr.Markdown(
            """
            **Fraunhofer IPA**  
            This application is provided under a basic licensing agreement for non-commercial use only.  
            For inquiries, visit [Fraunhofer IPA](https://www.ipa.fraunhofer.de).
            """,
            elem_id="footer-markdown",
        )

# CSS to center the footer content (assigned after the Blocks context closes)
demo.css = """
#footer-markdown {
    text-align: center;
    margin-top: 20px;
    padding-top: 10px;
    border-top: 1px solid #ccc;
}
"""

# Launch app
demo.launch()