# Source: Hugging Face Space "GroupSix/language" (status: Sleeping).
# File size: 4,315 bytes.

import os
import gradio as gr
import yt_dlp
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import split_on_silence


def download_video(url):
    """Fetch a video's audio track and store it as a WAV file.

    The download requests the best available audio stream and runs the
    ``FFmpegExtractAudio`` post-processor to convert it to WAV. The output
    name is fixed, so the result always lands in the current working
    directory under the same file name.

    :param url: The URL of the video to download.
    :return: Path to the downloaded audio file.
    """
    audio_stem = "downloaded_audio"

    # Post-processing step that turns the downloaded stream into a WAV file
    wav_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }

    options = {
        'format': 'bestaudio/best',
        'postprocessors': [wav_extractor],
        # yt-dlp substitutes %(ext)s with the final file extension
        'outtmpl': audio_stem + ".%(ext)s",
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

    return audio_stem + ".wav"


def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
    """
    Cut an audio file into chunks at silent passages and write each to disk.

    Chunks are written to the current working directory as ``chunk0.wav``,
    ``chunk1.wav``, ... in order.

    :param audio_file: Path to the audio file.
    :param min_silence_len: Minimum length of silence (in ms) to consider it as a split point.
    :param silence_thresh: Silence threshold (in dB).
    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
    :param max_length: Maximum length of each chunk (in ms).
    :return: List of paths to the audio chunks.
    """
    recording = AudioSegment.from_file(audio_file)

    # First pass: cut wherever a long enough silence is detected
    spoken_parts = split_on_silence(
        recording,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )

    # Second pass: break up anything that is still longer than max_length
    bounded_parts = []
    for part in spoken_parts:
        if len(part) > max_length:
            # Slicing an AudioSegment with a step yields consecutive pieces
            # of at most `max_length` milliseconds each
            bounded_parts.extend(part[::max_length])
            continue
        bounded_parts.append(part)

    # Write every piece to its own numbered WAV file
    chunk_filenames = []
    for index, part in enumerate(bounded_parts):
        target = f"chunk{index}.wav"
        part.export(target, format="wav")
        chunk_filenames.append(target)

    return chunk_filenames


# Hugging Face
# The processor and the model are both loaded from the same checkpoint,
# once at import time, and shared by every transcription call.
_WHISPER_CHECKPOINT = "GroupSix/whisper-small-sv"
processor = AutoProcessor.from_pretrained(_WHISPER_CHECKPOINT)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_WHISPER_CHECKPOINT)


def transcribe_audio(segment, num_segments):
    """Transcribe a single audio chunk with the module-level Whisper model.

    The chunk is loaded, downmixed to mono, resampled to 16 kHz if needed,
    and passed through the module-level ``processor`` and ``model``.

    :param segment: Path to the audio chunk to transcribe.
    :param num_segments: Total number of chunks; only used in the progress message.
    :return: The decoded transcription, or an empty string if nothing was decoded.
    """
    target_rate = 16000

    print(f"Current segment: {segment} (out of {num_segments})")

    # Load the audio file; torchaudio returns a (channels, frames) tensor
    waveform, sample_rate = torchaudio.load(segment)

    # Downmix to mono. The feature extractor expects mono audio; a 2-D
    # (channels, frames) array is interpreted as a batch of separate
    # utterances, so stereo input used to be transcribed once per channel
    # with only the first channel's text being kept.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Resample if necessary
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    # Turn the raw samples into model input features
    inputs = processor(waveform.numpy(), sampling_rate=target_rate, return_tensors="pt")

    # Generate the transcription; no gradients are needed for inference
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    # Decode the output and select the first transcription
    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
    transcription = decoded_output[0] if decoded_output else ""

    return transcription


# Main function to tie everything together
def process_video(url):
    """Download a video's audio, split it into chunks and transcribe each one.

    The downloaded audio file and every chunk file are removed before the
    function returns, including when splitting or transcription raises, so
    a failed request does not leave audio files behind on disk.

    :param url: The URL of the video to transcribe.
    :return: List with one transcription string per audio chunk, in order.
    """
    # Download first: if this fails there is nothing to clean up yet
    audio_file = download_video(url)

    segments = []
    try:
        # Split the audio, then transcribe each segment
        segments = split_audio(audio_file)
        return [transcribe_audio(segment, len(segments)) for segment in segments]
    finally:
        # Delete the audio file and the chunks. A file that is already gone
        # must not mask the result or an exception raised above.
        for path in [audio_file, *segments]:
            try:
                os.remove(path)
            except FileNotFoundError:
                pass


# Gradio interface
# One text box takes the video URL, another shows the transcription result.
_url_input = gr.Textbox(label="Swedish YouTube Video URL")
_transcript_output = gr.Textbox(label="Transcriptions")

# Example URLs offered in the UI; the commented-out entries are kept for reference.
_example_urls = [
    ["https://www.youtube.com/watch?v=hcxwTgEC7IM"],  # Fred på jorden
    ["https://www.youtube.com/watch?v=AzlipxrzMe4"],  # Jerry talar spanska
    ["https://www.youtube.com/watch?v=H_16_5kGh3I"],  # Det heter näsa, inte nos!
    # ["https://www.youtube.com/watch?v=v2m4V6FUseQ"],  # Ove blir arg på pantsystemet
    # ["https://www.youtube.com/watch?v=oA5QJHBNQkU"],  # Hur mår björnen egentligen? (takes too long)
]

iface = gr.Interface(
    fn=process_video,
    inputs=_url_input,
    outputs=_transcript_output,
    examples=_example_urls,
)

# Start the web app as soon as the module is executed
iface.launch()