File size: 4,315 Bytes
6e2e5b1
84494b4
6e2e5b1
 
 
 
 
 
84494b4
6e2e5b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d5a033
d2d532c
6e2e5b1
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import gradio as gr
import yt_dlp
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import split_on_silence


def download_video(url):
    """Download a video's audio track and convert it to a WAV file.

    The output name is fixed (``downloaded_audio.wav`` in the current working
    directory), so two calls running at the same time would write to the same
    path.

    :param url: The URL of the video to download.
    :return: Path to the downloaded audio file.
    """
    output_stem = "downloaded_audio"

    ydl_opts = {
        'format': 'bestaudio/best',
        # Every download shares one output name, so a URL that also carries a
        # playlist id must resolve to the single video only.
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        # yt-dlp fills in %(ext)s itself; the post-processor then produces .wav
        'outtmpl': f"{output_stem}.%(ext)s",
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    return f"{output_stem}.wav"


def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
    """
    Splits the audio file into chunks at points of silence.

    Chunks longer than ``max_length`` are cut further into consecutive pieces.
    Every resulting chunk is exported as ``chunk<i>.wav`` in the current
    working directory.

    :param audio_file: Path to the audio file.
    :param min_silence_len: Minimum length of silence (in ms) to consider it as a split point.
    :param silence_thresh: Silence threshold (in dB).
    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
    :param max_length: Maximum length of each chunk (in ms). Must be positive.
    :return: List of paths to the audio chunks.
    :raises ValueError: If ``max_length`` is not positive.
    """
    # A zero or negative step would not cut long chunks into bounded pieces,
    # so reject it before doing any work.
    if max_length <= 0:
        raise ValueError("max_length must be a positive number of milliseconds")

    # Load the audio file
    audio = AudioSegment.from_file(audio_file)

    # Split the audio file into chunks at points of silence
    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence
    )

    # Further split chunks if they are too long
    split_chunks = []
    for chunk in chunks:
        if len(chunk) <= max_length:
            split_chunks.append(chunk)
        else:
            # Slicing a pydub segment with a step yields consecutive pieces,
            # each at most `max_length` ms long.
            split_chunks.extend(chunk[::max_length])

    # Export the chunks to files
    chunk_filenames = []
    for i, chunk in enumerate(split_chunks):
        chunk_name = f"chunk{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_filenames.append(chunk_name)

    return chunk_filenames


# Hugging Face
# The processor and the model are both loaded from the same checkpoint,
# so the identifier is named once and reused.
_CHECKPOINT = "GroupSix/whisper-small-sv"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_CHECKPOINT)


def transcribe_audio(segment, num_segments):
    """Transcribe one audio chunk with the module-level model and processor.

    :param segment: Path to the audio chunk to transcribe.
    :param num_segments: Total number of chunks; only used in the progress message.
    :return: The first decoded transcription, or an empty string if nothing was decoded.
    """
    print(f"Current segment: {segment} (out of {num_segments})")

    # Load the audio file; torchaudio returns a (channels, frames) tensor
    waveform, sample_rate = torchaudio.load(segment)

    # Downmix to a single 1-D channel. Passing a multi-channel array on would
    # make the processor treat each channel as a separate batch item, so the
    # model would run once per channel and only the first result would be used.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Resample if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Run the model
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")

    # Generate the transcription; no gradients are needed for inference
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    # Decode the output and select the first transcription
    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
    transcription = decoded_output[0] if decoded_output else ""

    return transcription


# Main function to tie everything together
def process_video(url):
    """Download a video's audio, split it into chunks and transcribe each one.

    The downloaded audio file and all chunk files are removed before
    returning, including when splitting or transcription raises.

    :param url: The URL of the video to transcribe.
    :return: List with one transcription string per audio chunk, in order.
    """
    # Download and split the audio
    audio_file = download_video(url)
    segments = []
    try:
        segments = split_audio(audio_file)

        # Transcribe each segment
        total = len(segments)
        return [transcribe_audio(segment, total) for segment in segments]
    finally:
        # Delete the audio file and the chunks, even on failure, so that
        # temporary files do not pile up in the working directory.
        for path in [audio_file, *segments]:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Already gone: nothing left to clean up, and raising here
                # would mask the original error.
                pass


# Gradio interface
# Input and output components are named up front so the Interface call
# below reads as plain wiring.
url_input = gr.Textbox(label="Swedish YouTube Video URL")
transcript_output = gr.Textbox(label="Transcriptions")

# Sample inputs offered in the UI; each entry is a one-element list.
example_urls = [
    ["https://www.youtube.com/watch?v=hcxwTgEC7IM"],  # Fred på jorden
    ["https://www.youtube.com/watch?v=AzlipxrzMe4"],  # Jerry talar spanska
    ["https://www.youtube.com/watch?v=H_16_5kGh3I"],  # Det heter näsa, inte nos!
    # Currently disabled examples:
    # ["https://www.youtube.com/watch?v=v2m4V6FUseQ"],  # Ove blir arg på pantsystemet
    # ["https://www.youtube.com/watch?v=oA5QJHBNQkU"],  # Hur mår björnen egentligen? (takes too long)
]

iface = gr.Interface(
    fn=process_video,
    inputs=url_input,
    outputs=transcript_output,
    examples=example_urls,
)

iface.launch()