Updated app at Wed 6 Dec 2023 19:51:42 CET
- app.py +133 -1
- requirements.txt +5 -1
app.py
CHANGED
@@ -1,3 +1,135 @@
+import os
 import gradio as gr
+import yt_dlp
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import torch
+import torchaudio
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
 
-
+
+def download_video(url):
+    """Download a video and extract its audio track.
+
+    :param url: The URL of the video to download.
+    :return: Path to the downloaded audio file.
+    """
+    ydl_opts = {
+        'format': 'bestaudio/best',
+        'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': 'wav',
+            'preferredquality': '192',
+        }],
+        'outtmpl': "downloaded_audio.%(ext)s",
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        ydl.download([url])
+
+    return "downloaded_audio.wav"
+
+
+def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
+    """
+    Split the audio file into chunks at points of silence.
+
+    :param audio_file: Path to the audio file.
+    :param min_silence_len: Minimum length of silence (in ms) to count as a split point.
+    :param silence_thresh: Silence threshold (in dBFS).
+    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
+    :param max_length: Maximum length of each chunk (in ms).
+    :return: List of paths to the audio chunks.
+    """
+    # Load the audio file
+    audio = AudioSegment.from_file(audio_file)
+
+    # Split the audio file into chunks at points of silence
+    chunks = split_on_silence(
+        audio,
+        min_silence_len=min_silence_len,
+        silence_thresh=silence_thresh,
+        keep_silence=keep_silence
+    )
+
+    # Further split over-long chunks; pydub's step slicing yields consecutive max_length-ms pieces
+    split_chunks = []
+    for chunk in chunks:
+        if len(chunk) <= max_length:
+            split_chunks.append(chunk)
+        else:
+            split_chunks.extend(chunk[::max_length])
+
+    # Export the chunks to files
+    chunk_filenames = []
+    for i, chunk in enumerate(split_chunks):
+        chunk_name = f"chunk{i}.wav"
+        chunk.export(chunk_name, format="wav")
+        chunk_filenames.append(chunk_name)
+
+    return chunk_filenames
+
+
+# Hugging Face
+# Load the model and processor
+processor = AutoProcessor.from_pretrained("GroupSix/whisper-small-sv")
+model = AutoModelForSpeechSeq2Seq.from_pretrained("GroupSix/whisper-small-sv")
+
+
+def transcribe_audio(segment, num_segments):
+    print(f"Current segment: {segment} (out of {num_segments})")
+
+    # Load the audio file
+    waveform, sample_rate = torchaudio.load(segment)
+
+    # Resample to 16 kHz if necessary (Whisper's expected sampling rate)
+    if sample_rate != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = resampler(waveform)
+
+    # Downmix to mono and build the model inputs (Whisper expects a single channel)
+    inputs = processor(waveform.mean(dim=0).numpy(), sampling_rate=16000, return_tensors="pt")
+
+    # Generate the transcription
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs)
+
+    # Decode the output and select the first transcription
+    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    transcription = decoded_output[0] if decoded_output else ""
+
+    return transcription
+
+
+# Main function to tie everything together
+def process_video(url):
+    # Download and split the audio
+    audio_file = download_video(url)
+    segments = split_audio(audio_file)
+
+    # Transcribe each segment
+    transcriptions = [transcribe_audio(segment, len(segments)) for segment in segments]
+
+    # Delete the audio file and the chunks
+    os.remove(audio_file)
+    for segment in segments:
+        os.remove(segment)
+
+    return "\n".join(transcriptions)
+
+
+# Gradio interface
+iface = gr.Interface(
+    fn=process_video,
+    inputs=gr.Textbox(label="Swedish YouTube Video URL"),
+    outputs=gr.Textbox(label="Transcriptions"),
+    examples=[
+        ["https://www.youtube.com/watch?v=hcxwTgEC7IM"],  # Fred på jorden
+        ["https://www.youtube.com/watch?v=AzlipxrzMe4"],  # Jerry talar spanska
+        ["https://www.youtube.com/watch?v=H_16_5kGh3I"],  # Det heter näsa, inte nos!
+        ["https://www.youtube.com/watch?v=v2m4V6FUseQ"],  # Ove blir arg på pantsystemet
+        ["https://www.youtube.com/watch?v=oA5QJHBNQkU"],  # Hur mår björnen egentligen?
+    ]
+)
+
+iface.launch()
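A side note on the generation step, outside the commit itself: Whisper checkpoints auto-detect the spoken language unless a language token is forced, which can misfire on short chunks. Below is a minimal sketch of pinning the language and task, assuming GroupSix/whisper-small-sv behaves like a stock Whisper fine-tune; the chunk filename and the 16 kHz mono assumption are illustrative, not taken from the commit.

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("GroupSix/whisper-small-sv")
model = AutoModelForSpeechSeq2Seq.from_pretrained("GroupSix/whisper-small-sv")

# "chunk0.wav" is a hypothetical chunk from split_audio, assumed to be
# 16 kHz already; otherwise resample as in transcribe_audio above
waveform, sample_rate = torchaudio.load("chunk0.wav")
inputs = processor(waveform.mean(dim=0).numpy(), sampling_rate=16000, return_tensors="pt")

# Force Swedish transcription instead of relying on language auto-detection
forced_ids = processor.get_decoder_prompt_ids(language="swedish", task="transcribe")
with torch.no_grad():
    generated_ids = model.generate(inputs.input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])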
requirements.txt
CHANGED
@@ -1,2 +1,6 @@
 tensorflow
-
+torch==2.1.1
+torchaudio==2.1.1
+torchvision==0.16.1
+transformers==4.35.2
+yt-dlp==2023.11.16
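Two packaging notes on this list, not part of the commit: app.py imports pydub, which is not pinned here and is not guaranteed to be preinstalled, so the Space would likely also need a line such as (version illustrative):

pydub==0.25.1

And both pydub and yt-dlp's FFmpegExtractAudio post-processor shell out to the ffmpeg binary, which pip cannot install; on a Hugging Face Space that system dependency is typically supplied via a packages.txt file containing:

ffmpeg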