Spaces:
Running
Running
File size: 7,520 Bytes
696b78e e76c7d1 a6bff07 e76c7d1 a6bff07 e76c7d1 a6bff07 3f95d23 a6bff07 e76c7d1 a6bff07 e76c7d1 a6bff07 e76c7d1 a6bff07 e76c7d1 a6bff07 e76c7d1 a6bff07 8813f41 a6bff07 e76c7d1 a6bff07 e76c7d1 696b78e e76c7d1 696b78e e76c7d1 1f87b59 e76c7d1 1f87b59 e76c7d1 696b78e e76c7d1 3f95d23 e76c7d1 a6bff07 1f87b59 a6bff07 1f87b59 a6bff07 1f87b59 696b78e e76c7d1 a6bff07 e76c7d1 696b78e e76c7d1 a6bff07 e76c7d1 1f87b59 e76c7d1 a6bff07 696b78e e76c7d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
import gradio as gr
import time
import os
import zipfile
import torch
import librosa
import soundfile as sf
from transformers import pipeline
from typing import List, Tuple, Generator
import datetime
from pydub import AudioSegment
# Default Hugging Face ASR model (German Whisper tiny variant); the actual
# model used per request is chosen in the UI and loaded via load_model().
MODEL_NAME = "primeline/whisper-tiny-german-1224"
# Module-level ASR pipeline; loaded once at import time and reassigned in
# process_files_with_live_updates() when the user picks a different model.
speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)
# Initial status message shown in the output Markdown component
STANDARD_OUTPUT_TEXT = "**Status:**<br>"
def get_file_creation_date(file_path: str) -> str:
    """
    Return the creation date of a file as a human-readable string.

    NOTE: on POSIX systems ``st_ctime`` is the inode *change* time, not the
    true creation time; it is only the creation time on Windows.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: The timestamp formatted as ``YYYY-MM-DD HH:MM:SS``, or
        "File not found." if the path cannot be stat'ed.
    """
    try:
        file_stats = os.stat(file_path)
        creation_time = datetime.datetime.fromtimestamp(file_stats.st_ctime)
        return creation_time.strftime("%Y-%m-%d %H:%M:%S")
    except (OSError, TypeError):
        # OSError covers FileNotFoundError/PermissionError etc.; TypeError
        # guards against callers passing a non-path object instead of a
        # string (the original only caught FileNotFoundError and crashed
        # the whole request on anything else).
        return "File not found."
def load_model(model_name: str):
    """
    Build a fresh automatic-speech-recognition pipeline for a model.

    Args:
        model_name (str): Hugging Face model identifier to load.

    Returns:
        pipeline: A transformers ASR pipeline backed by *model_name*.
    """
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=model_name,
    )
    return asr_pipeline
def convert_to_wav(file_path: str) -> str:
    """
    Convert .m4a/.aac audio files to WAV; pass other formats through.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the exported WAV file for .m4a/.aac input, otherwise
        the original path unchanged.
    """
    # Case-insensitive check so uploads like ".M4A"/".AAC" are also
    # converted (the original endswith test was case-sensitive).
    if file_path.lower().endswith((".m4a", ".aac")):
        audio = AudioSegment.from_file(file_path)
        # splitext handles dotted directory names safely when building
        # the sibling .wav path.
        wav_path = os.path.splitext(file_path)[0] + ".wav"
        audio.export(wav_path, format="wav")
        return wav_path
    return file_path
def preprocess_audio(file_path: str) -> str:
    """
    Resample an audio file to 16 kHz WAV for the ASR model.

    Args:
        file_path (str): Path to the uploaded audio file.

    Returns:
        str: Path to the preprocessed ``<stem>_processed.wav`` file.
    """
    file_path = convert_to_wav(file_path)  # Convert m4a/aac to WAV first
    # Whisper-family pipelines expect 16 kHz input; librosa resamples
    # (and downmixes to mono) on load.
    y, sr = librosa.load(file_path, sr=16000)
    # Build "<stem>_processed.wav" from any input extension. The original
    # chained str.replace calls produced "..._processed_processed.wav" for
    # .mp3 input, because the first replace already introduced ".wav" and
    # the second replace then matched it again.
    processed_path = os.path.splitext(file_path)[0] + "_processed.wav"
    sf.write(processed_path, y, sr)  # Save the resampled audio
    return processed_path
def process_files_with_live_updates(
    files: List[gr.File],
    model_option: str,
    output_format: str
) -> Generator[Tuple[str, List[str]], None, None]:
    """
    Process a list of uploaded files, transcribe audio, and provide live updates.

    Args:
        files (List[gr.File]): List of files uploaded by the user.
        model_option (str): Selected Hugging Face model name.
        output_format (str): Selected output format option (display only).

    Yields:
        Tuple[str, List[str]]: Updated status Markdown and the list of
        generated output file paths; the final yield also includes a zip
        archive bundling all transcripts.
    """
    global speech_to_text
    # Reload the pipeline so the user's dropdown choice takes effect.
    speech_to_text = load_model(model_option)

    file_details: List[str] = []
    total_files = len(files)
    output_files: List[str] = []

    # Guard against an empty upload: the original code hit
    # ZeroDivisionError / NameError (loop variable `idx` undefined) in the
    # final progress yield when no files were provided.
    if total_files == 0:
        yield (f"{STANDARD_OUTPUT_TEXT}No files uploaded.", output_files)
        return

    # Folder to temporarily store output files
    output_dir = "output_files"
    os.makedirs(output_dir, exist_ok=True)

    for idx, file in enumerate(files):
        # Preprocess (convert + resample) the uploaded audio file
        preprocessed_path = preprocess_audio(file.name)
        # return_timestamps=True lets the pipeline chunk audio longer than
        # the model's native window.
        transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
        transcription = transcription_result["text"]

        # Save transcription next to the other outputs
        base_name = os.path.basename(file.name)
        txt_filename = os.path.join(
            output_dir, f"transcription_{os.path.splitext(base_name)[0]}.txt"
        )
        with open(txt_filename, "w", encoding="utf-8") as txt_file:
            txt_file.write(transcription)
        output_files.append(txt_filename)

        detail = (
            f"**File Name**: {base_name}<br>"
            # Pass the path (file.name), not the gr.File object: os.stat on
            # the object raised TypeError in the original code.
            f"**File Date**: {get_file_creation_date(file.name)}<br>"
            f"**Options**: {model_option} - {output_format}<br>"
            f"**Transcription**: {transcription}<br><br>"
        )
        file_details.append(detail)

        # Stream progress back to the UI after each file
        progress = int(((idx + 1) / total_files) * 100)
        yield (f"**Status: {progress}%**<br>" + "".join(file_details), output_files)

    # Bundle all transcripts into a single zip for convenient download
    zip_filename = os.path.join(output_dir, "output_files.zip")
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file_path in output_files:
            zipf.write(file_path, os.path.basename(file_path))
    output_files.append(zip_filename)

    # Final yield (loop ran to completion, so progress is 100%)
    yield ("**Status: 100%**<br>" + "".join(file_details), output_files)
# ---------------------------------------------------------------------------
# Gradio app layout: file upload + model/format selection on top, live
# transcription progress rendered as Markdown, downloadable output files,
# and a footer. Generator yields from process_files_with_live_updates are
# streamed into the output components as they are produced.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    # Title and description shown at the top of the page
    gr.Markdown("# Speech-to-Text Batch Processor (German)")
    gr.Markdown(
        """
        Upload multiple audio files (.wav, .mp3, .m4a, .aac), select desired processing options (i.e. the model), and view real-time updates as files are transcribed.
        The application uses advanced AI models for sequential speech-to-text translation.
        """
    )
    # Input section: uploads on the left, model/format options on the right
    with gr.Row():
        with gr.Column():
            file_input = gr.Files(file_types=[".wav", ".mp3", ".m4a", ".aac"], label="Upload your audio files")
        with gr.Column():
            model_dropdown = gr.Dropdown(
                choices=[
                    "primeline/whisper-large-v3-german",
                    "primeline/whisper-tiny-german-1224",
                    "primeline/whisper-tiny-german"
                ],
                label="Select Model",
                value="primeline/whisper-large-v3-german",
            )
            # Only one format is offered today; kept as a dropdown so more
            # formats can be added without changing the UI wiring.
            dropdown_2 = gr.Dropdown(
                choices=["Format: Plain Text"],
                label="Select Output Format",
                value="Format: Plain Text",
            )
    # Action buttons
    with gr.Row():
        submit_button = gr.Button("Start Transcription")
        clear_button = gr.Button("Clear")
    # Output section: live Markdown progress plus downloadable files
    output_md = gr.Markdown(label="Transcription Progress", value=STANDARD_OUTPUT_TEXT)
    output_files = gr.Files(label="Generated Output Files")
    # Submit streams each yield of the generator into (output_md, output_files)
    submit_button.click(
        process_files_with_live_updates,
        inputs=[file_input, model_dropdown, dropdown_2],
        outputs=[output_md, output_files],
    )
    # Clear resets every input/output component to its initial value
    clear_button.click(
        lambda: (None, "primeline/whisper-large-v3-german", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
        inputs=[],  # No inputs
        outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
    )
    # NOTE(review): assumes Fraunhofer-IPA-Logo.jpg exists next to this
    # script — confirm it is shipped with the Space.
    gr.Image("Fraunhofer-IPA-Logo.jpg", show_label=False)
    # Centered footer with licensing text
    with gr.Row():
        gr.Markdown(
            """
            **Fraunhofer IPA**
            This application is provided under a basic licensing agreement for non-commercial use only.
            For inquiries, visit [Fraunhofer IPA](https://www.ipa.fraunhofer.de).
            """,
            elem_id="footer-markdown",
        )
# CSS to center the footer content (assigned after the Blocks context closes)
demo.css = """
#footer-markdown {
    text-align: center;
    margin-top: 20px;
    padding-top: 10px;
    border-top: 1px solid #ccc;
}
"""
# Launch app
demo.launch()
|