import os
import gradio as gr
import yt_dlp
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import split_on_silence


def download_video(url):
    """Download video and extract audio.

    :param url: The URL of the video to download.
    :return: Path to the downloaded audio file.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': 'downloaded_audio.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return 'downloaded_audio.wav'


def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
    """
    Splits the audio file into chunks at points of silence.

    :param audio_file: Path to the audio file.
    :param min_silence_len: Minimum length of silence (in ms) to consider it as a split point.
    :param silence_thresh: Silence threshold (in dBFS).
    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
    :param max_length: Maximum length of each chunk (in ms).
    :return: List of paths to the audio chunks.
    """
    # Load the audio file
    audio = AudioSegment.from_file(audio_file)

    # Split the audio file into chunks at points of silence
    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence
    )

    # Further split chunks if they are too long
    split_chunks = []
    for chunk in chunks:
        if len(chunk) <= max_length:
            split_chunks.append(chunk)
        else:
            # pydub's step slicing yields consecutive sub-chunks of at most max_length ms
            split_chunks.extend(chunk[::max_length])

    # Export the chunks to files
    chunk_filenames = []
    for i, chunk in enumerate(split_chunks):
        chunk_name = f"chunk{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_filenames.append(chunk_name)
    return chunk_filenames


# Hugging Face: load the model and processor
processor = AutoProcessor.from_pretrained("GroupSix/whisper-small-sv")
model = AutoModelForSpeechSeq2Seq.from_pretrained("GroupSix/whisper-small-sv")
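
# Note (a sketch, not part of the original app): if this ever runs on a GPU, the model could be
# moved there, but the tensors built in transcribe_audio() would then also need .to(device),
# so the lines are left commented out here.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)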


def transcribe_audio(segment, num_segments):
    """Transcribe a single audio chunk with the fine-tuned Whisper model.

    :param segment: Path to the audio chunk to transcribe.
    :param num_segments: Total number of chunks, used for progress logging.
    :return: The transcribed text.
    """
    print(f"Transcribing segment {segment} (one of {num_segments})")

    # Load the audio file
    waveform, sample_rate = torchaudio.load(segment)

    # Resample to 16 kHz if necessary (the rate Whisper expects)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Prepare the model inputs
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")

    # Generate the transcription
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    # Decode the output and select the first transcription
    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
    transcription = decoded_output[0] if decoded_output else ""
    return transcription
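
# Optional variation (a hedged sketch, not used above): decoding could be pinned explicitly to
# Swedish inside transcribe_audio(), assuming a transformers version where WhisperProcessor
# exposes get_decoder_prompt_ids():
# forced_ids = processor.get_decoder_prompt_ids(language="swedish", task="transcribe")
# generated_ids = model.generate(**inputs, forced_decoder_ids=forced_ids)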


# Main function to tie everything together
def process_video(url):
    # Download and split the audio
    audio_file = download_video(url)
    segments = split_audio(audio_file)

    # Transcribe each segment
    transcriptions = [transcribe_audio(segment, len(segments)) for segment in segments]

    # Delete the audio file and the chunks
    os.remove(audio_file)
    for segment in segments:
        os.remove(segment)

    # Join the chunk transcriptions so the output Textbox shows plain text rather than a Python list
    return "\n".join(transcriptions)
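
# A minimal local smoke test (a sketch; assumes ffmpeg is installed and the URL is reachable):
# if __name__ == "__main__":
#     print(process_video("https://www.youtube.com/watch?v=hcxwTgEC7IM"))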


# Gradio interface
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Textbox(label="Swedish YouTube Video URL"),
    outputs=gr.Textbox(label="Transcriptions"),
    # examples=[
    # #     ["https://www.youtube.com/watch?v=hcxwTgEC7IM"],  # Fred på jorden
    # #     ["https://www.youtube.com/watch?v=AzlipxrzMe4"],  # Jerry talar spanska
    # #     ["https://www.youtube.com/watch?v=H_16_5kGh3I"],  # Det heter näsa, inte nos!
    # #     ["https://www.youtube.com/watch?v=v2m4V6FUseQ"],  # Ove blir arg på pantsystemet
    # #     ["https://www.youtube.com/watch?v=oA5QJHBNQkU"],  # Hur mår björnen egentligen? (takes too long)
    # ]
)

iface.launch()