File size: 7,520 Bytes
696b78e
e76c7d1
 
 
a6bff07
 
 
 
e76c7d1
a6bff07
 
 
 
 
 
e76c7d1
 
 
 
a6bff07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f95d23
 
 
a6bff07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e76c7d1
 
a6bff07
 
e76c7d1
 
a6bff07
e76c7d1
 
 
a6bff07
 
e76c7d1
 
 
 
a6bff07
 
 
e76c7d1
 
 
 
 
 
 
 
 
a6bff07
 
8813f41
a6bff07
 
 
 
 
 
 
 
 
e76c7d1
 
 
a6bff07
 
 
 
e76c7d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696b78e
e76c7d1
 
696b78e
e76c7d1
1f87b59
e76c7d1
 
1f87b59
e76c7d1
 
 
696b78e
e76c7d1
 
 
3f95d23
e76c7d1
a6bff07
 
1f87b59
a6bff07
1f87b59
 
a6bff07
1f87b59
696b78e
e76c7d1
a6bff07
e76c7d1
 
696b78e
 
e76c7d1
 
 
 
 
 
 
 
 
 
 
 
a6bff07
e76c7d1
 
 
 
1f87b59
e76c7d1
a6bff07
696b78e
 
e76c7d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import gradio as gr
import time
import os
import zipfile
import torch
import librosa
import soundfile as sf
from transformers import pipeline
from typing import List, Tuple, Generator
import datetime
from pydub import AudioSegment

# Initial model name
MODEL_NAME = "primeline/whisper-tiny-german-1224"
# Load a default ASR pipeline at import time so transcription works immediately;
# it is re-assigned per request inside process_files_with_live_updates.
# NOTE(review): the UI dropdown defaults to whisper-large-v3-german, not this
# model — confirm which startup default is intended.
speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)

# Initial status message
STANDARD_OUTPUT_TEXT = "**Status:**<br>"

def get_file_creation_date(file_path: str) -> str:
    """
    Return the creation timestamp of a file as a human-readable string.

    Note: ``st_ctime`` is the true creation time only on Windows; on
    Unix-like systems it is the time of the last metadata change.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: Timestamp formatted as ``YYYY-MM-DD HH:MM:SS``, or an error
        message when the file cannot be stat'ed.
    """
    try:
        # Only the stat call can raise; keep the try body minimal.
        file_stats = os.stat(file_path)
    except OSError:
        # FileNotFoundError is a subclass of OSError; catching the base class
        # also covers permission errors and other unreadable paths instead of
        # letting them crash the transcription loop.
        return "File not found."

    creation_time = datetime.datetime.fromtimestamp(file_stats.st_ctime)
    return creation_time.strftime("%Y-%m-%d %H:%M:%S")

def load_model(model_name: str):
    """
    Build an automatic-speech-recognition pipeline for the given model.

    Args:
        model_name (str): Hugging Face model identifier to load.

    Returns:
        pipeline: A ready-to-use ASR pipeline for that model.
    """
    asr_pipeline = pipeline("automatic-speech-recognition", model=model_name)
    return asr_pipeline

def convert_to_wav(file_path: str) -> str:
    """
    Convert m4a/aac audio files to WAV format if necessary.

    Files that are already in a librosa-readable format (.wav, .mp3) are
    returned unchanged.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the converted WAV file, or the original path when no
        conversion was needed.
    """
    # Case-insensitive tuple check: the original compared lowercase-only
    # extensions one by one, so ".M4A"/".AAC" uploads slipped through
    # unconverted.
    if file_path.lower().endswith((".m4a", ".aac")):
        # pydub (via ffmpeg) handles container formats librosa cannot
        # read directly.
        audio = AudioSegment.from_file(file_path)
        wav_path = os.path.splitext(file_path)[0] + ".wav"
        audio.export(wav_path, format="wav")
        return wav_path
    return file_path

def preprocess_audio(file_path: str) -> str:
    """
    Preprocess an audio file so it is compatible with the ASR model.

    Converts m4a/aac input to WAV first, then resamples to 16 kHz (librosa
    loads mono by default) and writes the result next to the input file.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the preprocessed (resampled) audio file.
    """
    file_path = convert_to_wav(file_path)  # Convert container formats to WAV if necessary
    y, sr = librosa.load(file_path, sr=16000)  # Resample to the 16 kHz the model expects
    # BUG FIX: the original chained str.replace calls, which turned
    # "x.mp3" into "x_processed_processed.wav" (the first replace produced
    # ".wav", which the second replace then matched again) and also replaced
    # matching substrings anywhere in the path. splitext rewrites only the
    # final extension.
    processed_path = os.path.splitext(file_path)[0] + "_processed.wav"
    sf.write(processed_path, y, sr)  # Save the resampled audio
    return processed_path

def process_files_with_live_updates(
    files: List[gr.File],
    model_option: str,
    output_format: str
) -> Generator[Tuple[str, List[str]], None, None]:
    """
    Processes a list of uploaded files, transcribes audio, and provides live updates.

    Args:
        files (List[gr.File]): List of files uploaded by the user.
        model_option (str): Selected model option.
        output_format (str): Selected output format option (currently informational only).

    Yields:
        Tuple[str, List[str]]: Updated status message and list of processed file paths.
    """
    global speech_to_text
    speech_to_text = load_model(model_option)

    total_files = len(files) if files else 0
    if total_files == 0:
        # Guard: without this, the progress computation below divides by zero
        # and the final yield references an unbound loop variable.
        yield (STANDARD_OUTPUT_TEXT + "No files uploaded.<br>", [])
        return

    file_details: List[str] = []
    output_files: List[str] = []

    # Create a folder to temporarily store output files
    output_dir = "output_files"
    os.makedirs(output_dir, exist_ok=True)

    for idx, file in enumerate(files):
        # Preprocess audio file (format conversion + 16 kHz resample)
        preprocessed_path = preprocess_audio(file.name)

        # return_timestamps=True lets Whisper process audio longer than 30 s
        transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
        transcription = transcription_result["text"]

        # Save transcription to file; splitext/basename instead of manual
        # '/'-splitting so Windows paths and dotted file names work too.
        base_name = os.path.splitext(os.path.basename(file.name))[0]
        txt_filename = os.path.join(output_dir, f"transcription_{base_name}.txt")
        with open(txt_filename, "w", encoding="utf-8") as txt_file:
            txt_file.write(transcription)
        output_files.append(txt_filename)

        # Add to file details.
        # BUG FIX: the original passed the gr.File object itself to
        # get_file_creation_date, but os.stat needs a path string (file.name).
        detail = (
            f"**File Name**: {os.path.basename(file.name)}<br>"
            f"**File Date**: {get_file_creation_date(file.name)}<br>"
            f"**Options**: {model_option} - {output_format}<br>"
            f"**Transcription**: {transcription}<br><br>"
        )
        file_details.append(detail)

        # Update progress and yield the updated Markdown
        yield (
            f"**Status: {int(((idx + 1) / total_files) * 100)}%**<br>" + "".join(file_details),
            output_files,
        )

    # Bundle all transcriptions into a single zip for convenient download
    zip_filename = os.path.join(output_dir, "output_files.zip")
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file_path in output_files:
            zipf.write(file_path, os.path.basename(file_path))
    output_files.append(zip_filename)

    # Final yield: the loop completed, so progress is exactly 100%
    yield (
        "**Status: 100%**<br>" + "".join(file_details),
        output_files,
    )

# Gradio app layout — top-level script that builds the UI and launches it.
with gr.Blocks() as demo:

    # Title and Description
    gr.Markdown("# Speech-to-Text Batch Processor (German)")
    gr.Markdown(
        """
        Upload multiple audio files (.wav, .mp3, .m4a, .aac), select desired processing options (i.e. the model), and view real-time updates as files are transcribed.
        The application uses advanced AI models for sequential speech-to-text translation.
        """
    )

    # Input section: file upload on the left, model/format options on the right
    with gr.Row():
        with gr.Column():
            file_input = gr.Files(file_types=[".wav", ".mp3", ".m4a", ".aac"], label="Upload your audio files")
        with gr.Column():
            model_dropdown = gr.Dropdown(
                choices=[
                    "primeline/whisper-large-v3-german",
                    "primeline/whisper-tiny-german-1224", 
                    "primeline/whisper-tiny-german" 
                    ],
                label="Select Model",
                value="primeline/whisper-large-v3-german",
            )
            # Single-choice placeholder; the value is only echoed in the
            # status output, not used to change the output format.
            dropdown_2 = gr.Dropdown(
                choices=["Format: Plain Text"],
                label="Select Output Format",
                value="Format: Plain Text",
            )

    # Buttons
    with gr.Row():
        submit_button = gr.Button("Start Transcription")
        clear_button = gr.Button("Clear")

    # Output section: live Markdown progress plus downloadable result files
    output_md = gr.Markdown(label="Transcription Progress", value=STANDARD_OUTPUT_TEXT)
    output_files = gr.Files(label="Generated Output Files")

    # Button actions — the handler is a generator, so each yield streams a
    # progress update into output_md/output_files.
    submit_button.click(
        process_files_with_live_updates,
        inputs=[file_input, model_dropdown, dropdown_2],
        outputs=[output_md, output_files],
    )

    # Reset all inputs/outputs back to their initial defaults
    clear_button.click(
        lambda: (None, "primeline/whisper-large-v3-german", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
        inputs=[],  # No inputs
        outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
    )

    # NOTE(review): assumes Fraunhofer-IPA-Logo.jpg exists next to this
    # script — confirm the asset is deployed with the app.
    gr.Image("Fraunhofer-IPA-Logo.jpg", show_label=False)

    # Centered Footer with Logo and Licensing Text
    with gr.Row():
        gr.Markdown(
            """
            **Fraunhofer IPA**  
            This application is provided under a basic licensing agreement for non-commercial use only.  
            For inquiries, visit [Fraunhofer IPA](https://www.ipa.fraunhofer.de).
            """,
            elem_id="footer-markdown",
        )

# CSS to center the footer content (assigned after the Blocks context closes)
demo.css = """
#footer-markdown {
    text-align: center;
    margin-top: 20px;
    padding-top: 10px;
    border-top: 1px solid #ccc;
}
"""

# Launch app
demo.launch()