File size: 4,315 Bytes
6e2e5b1 84494b4 6e2e5b1 84494b4 6e2e5b1 6d5a033 d2d532c 6e2e5b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import os
import gradio as gr
import yt_dlp
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import split_on_silence
def download_video(url):
    """Fetch the audio track of a video and store it as a WAV file.

    The download is delegated to yt-dlp, which is asked for the best
    available audio stream and configured with an ``FFmpegExtractAudio``
    post-processor targeting the ``wav`` codec. The output template uses a
    fixed base name and a relative path, so every call writes to the same
    location.

    :param url: The URL of the video to download.
    :return: Path to the downloaded audio file (``downloaded_audio.wav``).
    """
    wav_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [wav_extractor],
        # %(ext)s is a yt-dlp template field, filled in by the downloader.
        'outtmpl': "downloaded_audio.%(ext)s",
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

    # Matches the output template above once the audio is converted to wav.
    return "downloaded_audio.wav"
def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
    """
    Cut an audio file into WAV chunks, breaking at silent passages.

    Pieces produced by the silence splitter that are longer than
    ``max_length`` are sliced further into consecutive parts of at most
    ``max_length`` milliseconds. Each resulting piece is exported to
    ``chunk<N>.wav`` (N counting from 0) using a relative path.

    :param audio_file: Path to the audio file.
    :param min_silence_len: Minimum length of silence (in ms) to consider it as a split point.
    :param silence_thresh: Silence threshold (in dB).
    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
    :param max_length: Maximum length of each chunk (in ms).
    :return: List of paths to the exported chunk files, in playback order.
    """
    recording = AudioSegment.from_file(audio_file)

    # First pass: break the recording wherever a long enough silence occurs.
    silence_separated = split_on_silence(
        recording,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )

    # Second pass: cap the duration of every piece.
    pieces = []
    for part in silence_separated:
        if len(part) > max_length:
            # A stepped slice of an AudioSegment yields consecutive
            # sub-segments, each up to ``max_length`` ms long.
            pieces.extend(part[::max_length])
        else:
            pieces.append(part)

    # Write each piece to disk and remember where it went.
    exported_paths = []
    for index, piece in enumerate(pieces):
        target = f"chunk{index}.wav"
        piece.export(target, format="wav")
        exported_paths.append(target)
    return exported_paths
# Hugging Face
# The processor and the speech-to-text model are loaded once, at import time,
# from the same checkpoint.
_MODEL_ID = "GroupSix/whisper-small-sv"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_MODEL_ID)
def transcribe_audio(segment, num_segments):
    """Transcribe a single audio chunk with the module-level model.

    The chunk is loaded, downmixed to mono, resampled to 16 kHz when its
    sample rate differs, and passed through the module-level ``processor``
    and ``model``.

    :param segment: Path to the audio chunk to transcribe.
    :param num_segments: Total number of chunks; only used in the progress message.
    :return: The first decoded transcription, or "" if decoding produced nothing.
    """
    target_rate = 16000

    print(f"Current segment: {segment} (out of {num_segments})")

    # Load the audio file; torchaudio returns a (channels, frames) tensor
    waveform, sample_rate = torchaudio.load(segment)

    # Downmix to mono. A multi-channel (channels, frames) array handed to the
    # processor is interpreted as a batch with one item per channel, so the
    # model would run once per channel and only the first channel's text
    # would be returned.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Resample if necessary
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    # Prepare the model inputs
    inputs = processor(waveform.numpy(), sampling_rate=target_rate, return_tensors="pt")

    # Generate the transcription (inference only, no gradients needed)
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    # Decode the output and select the first transcription
    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded_output[0] if decoded_output else ""
# Main function to tie everything together
def process_video(url):
    """Download a video's audio, split it into chunks and transcribe each one.

    The downloaded audio file and the chunk files are removed before
    returning, including when splitting or transcription raises, so a failed
    run does not leave stale files behind for the next one.

    :param url: The URL of the video to transcribe.
    :return: List of transcriptions, one per audio chunk, in order.
    """
    audio_file = download_video(url)
    segments = []
    try:
        segments = split_audio(audio_file)
        total = len(segments)
        # Transcribe each segment
        return [transcribe_audio(segment, total) for segment in segments]
    finally:
        # Delete the audio file and the chunks; a path that is already gone
        # must not mask the real error (or the result) with a cleanup failure.
        for path in (audio_file, *segments):
            if os.path.exists(path):
                os.remove(path)
# Gradio interface
# Example videos offered in the UI; each entry is one row of inputs.
# The trailing comment is the video title (English gloss in parentheses).
example_urls = [
    ["https://www.youtube.com/watch?v=hcxwTgEC7IM"],  # "Fred på jorden" (Peace on Earth)
    ["https://www.youtube.com/watch?v=AzlipxrzMe4"],  # "Jerry talar spanska" (Jerry speaks Spanish)
    ["https://www.youtube.com/watch?v=H_16_5kGh3I"],  # "Det heter näsa, inte nos!" (It's called a nose, not a snout!)
    # ["https://www.youtube.com/watch?v=v2m4V6FUseQ"],  # "Ove blir arg på pantsystemet" (Ove gets angry at the deposit system)
    # ["https://www.youtube.com/watch?v=oA5QJHBNQkU"],  # "Hur mår björnen egentligen?" (How is the bear really doing?) (takes too long)
]

iface = gr.Interface(
    fn=process_video,
    inputs=gr.Textbox(label="Swedish YouTube Video URL"),
    outputs=gr.Textbox(label="Transcriptions"),
    examples=example_urls,
)

# Start the web UI as soon as the script is run.
iface.launch()
|