import os

import gradio as gr
import torch
import torchaudio
import yt_dlp
from pydub import AudioSegment
from pydub.silence import split_on_silence
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor


def download_video(url):
    """Download a video with yt-dlp and extract its audio track as WAV.

    :param url: The URL of the video to download.
    :return: Path to the extracted audio file.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            # Audio extraction is delegated to ffmpeg, which must be installed.
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': 'downloaded_audio.%(ext)s',
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # The postprocessor converts the download to WAV, so the output filename is fixed.
    return "downloaded_audio.wav"


def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
    """Split the audio file into chunks at points of silence.

    :param audio_file: Path to the audio file.
    :param min_silence_len: Minimum length of silence (in ms) to treat as a split point.
    :param silence_thresh: Silence threshold (in dBFS).
    :param keep_silence: Amount of silence (in ms) to keep at the start and end of each chunk.
    :param max_length: Maximum length of each chunk (in ms).
    :return: List of paths to the exported audio chunks.
    """
    audio = AudioSegment.from_file(audio_file)

    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence
    )

    # Enforce the maximum chunk length: slicing an AudioSegment with a step
    # (chunk[::max_length]) yields consecutive pieces of at most max_length ms.
    split_chunks = []
    for chunk in chunks:
        if len(chunk) <= max_length:
            split_chunks.append(chunk)
        else:
            split_chunks.extend(chunk[::max_length])

    # Export each chunk to its own WAV file and return the filenames.
    chunk_filenames = []
    for i, chunk in enumerate(split_chunks):
        chunk_name = f"chunk{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_filenames.append(chunk_name)

    return chunk_filenames


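# Load the Swedish fine-tuned Whisper checkpoint and its processor once at module
# load so every transcription call reuses the same weights.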
processor = AutoProcessor.from_pretrained("GroupSix/whisper-small-sv")
model = AutoModelForSpeechSeq2Seq.from_pretrained("GroupSix/whisper-small-sv")


def transcribe_audio(segment, num_segments):
    """Transcribe a single audio chunk with the Whisper model.

    :param segment: Path to the audio chunk to transcribe.
    :param num_segments: Total number of chunks, used for progress logging.
    :return: The transcribed text for this chunk.
    """
    print(f"Transcribing {segment} (one of {num_segments} segments)")

    waveform, sample_rate = torchaudio.load(segment)

    # Whisper expects 16 kHz mono input: downmix multi-channel audio and resample.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
    transcription = decoded_output[0] if decoded_output else ""

    return transcription


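# End-to-end pipeline used as the Gradio callback.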
def process_video(url):
    """Download a video, split its audio on silence, and transcribe each chunk.

    :param url: The URL of the video to transcribe.
    :return: The full transcription as a single string.
    """
    audio_file = download_video(url)
    segments = split_audio(audio_file)

    transcriptions = [transcribe_audio(segment, len(segments)) for segment in segments]

    # Remove the downloaded audio and all intermediate chunk files.
    os.remove(audio_file)
    for segment in segments:
        os.remove(segment)

    # Join the per-chunk transcriptions so the Gradio Textbox shows one string.
    return " ".join(transcriptions)


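# Minimal Gradio interface: paste a YouTube URL and receive the transcription.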
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Textbox(label="Swedish YouTube Video URL"),
    outputs=gr.Textbox(label="Transcriptions"),
)

iface.launch()