Updated app at Wed 6 Dec 2023 19:51:42 CET
- app.py +133 -1
- requirements.txt +5 -1
app.py
CHANGED
@@ -1,3 +1,135 @@
+import os
 import gradio as gr
+import yt_dlp
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import torch
+import torchaudio
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
 
-
+
+def download_video(url):
+    """Download a video and extract its audio track.
+
+    :param url: The URL of the video to download.
+    :return: Path to the downloaded audio file.
+    """
+    ydl_opts = {
+        'format': 'bestaudio/best',
+        'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': 'wav',
+            'preferredquality': '192',
+        }],
+        'outtmpl': "downloaded_audio.%(ext)s",
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        ydl.download([url])
+
+    return "downloaded_audio.wav"
+
+
+def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
+    """
+    Split the audio file into chunks at points of silence.
+
+    :param audio_file: Path to the audio file.
+    :param min_silence_len: Minimum length of silence (in ms) to count as a split point.
+    :param silence_thresh: Silence threshold (in dBFS).
+    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
+    :param max_length: Maximum length of each chunk (in ms).
+    :return: List of paths to the audio chunks.
+    """
+    # Load the audio file
+    audio = AudioSegment.from_file(audio_file)
+
+    # Split the audio file into chunks at points of silence
+    chunks = split_on_silence(
+        audio,
+        min_silence_len=min_silence_len,
+        silence_thresh=silence_thresh,
+        keep_silence=keep_silence
+    )
+
+    # Further split over-long chunks; pydub's step slicing yields consecutive max_length-ms pieces
+    split_chunks = []
+    for chunk in chunks:
+        if len(chunk) <= max_length:
+            split_chunks.append(chunk)
+        else:
+            split_chunks.extend(chunk[::max_length])
+
+    # Export the chunks to files
+    chunk_filenames = []
+    for i, chunk in enumerate(split_chunks):
+        chunk_name = f"chunk{i}.wav"
+        chunk.export(chunk_name, format="wav")
+        chunk_filenames.append(chunk_name)
+
+    return chunk_filenames
+
+
+# Hugging Face
+# Load the model and processor
+processor = AutoProcessor.from_pretrained("GroupSix/whisper-small-sv")
+model = AutoModelForSpeechSeq2Seq.from_pretrained("GroupSix/whisper-small-sv")
+
+
+def transcribe_audio(segment, num_segments):
+    print(f"Current segment: {segment} (out of {num_segments})")
+
+    # Load the audio file
+    waveform, sample_rate = torchaudio.load(segment)
+
+    # Resample to 16 kHz if necessary (Whisper's expected sampling rate)
+    if sample_rate != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = resampler(waveform)
+
+    # Downmix to mono and build the model inputs (Whisper expects a single channel)
+    inputs = processor(waveform.mean(dim=0).numpy(), sampling_rate=16000, return_tensors="pt")
+
+    # Generate the transcription
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs)
+
+    # Decode the output and select the first transcription
+    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    transcription = decoded_output[0] if decoded_output else ""
+
+    return transcription
+
+
+# Main function to tie everything together
+def process_video(url):
+    # Download and split the audio
+    audio_file = download_video(url)
+    segments = split_audio(audio_file)
+
+    # Transcribe each segment
+    transcriptions = [transcribe_audio(segment, len(segments)) for segment in segments]
+
+    # Delete the audio file and the chunks
+    os.remove(audio_file)
+    for segment in segments:
+        os.remove(segment)
+
+    return "\n".join(transcriptions)
+
+
+# Gradio interface
+iface = gr.Interface(
+    fn=process_video,
+    inputs=gr.Textbox(label="Swedish YouTube Video URL"),
+    outputs=gr.Textbox(label="Transcriptions"),
+    examples=[
+        ["https://www.youtube.com/watch?v=hcxwTgEC7IM"],  # Fred på jorden
+        ["https://www.youtube.com/watch?v=AzlipxrzMe4"],  # Jerry talar spanska
+        ["https://www.youtube.com/watch?v=H_16_5kGh3I"],  # Det heter näsa, inte nos!
+        ["https://www.youtube.com/watch?v=v2m4V6FUseQ"],  # Ove blir arg på pantsystemet
+        ["https://www.youtube.com/watch?v=oA5QJHBNQkU"],  # Hur mår björnen egentligen?
+    ]
+)
+
+iface.launch()
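A side note on the generation step, outside the commit itself: Whisper checkpoints auto-detect the spoken language unless a language token is forced, which can misfire on short chunks. Below is a minimal sketch of pinning the language and task, assuming GroupSix/whisper-small-sv behaves like a stock Whisper fine-tune; the chunk filename and the 16 kHz mono assumption are illustrative, not taken from the commit.

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("GroupSix/whisper-small-sv")
model = AutoModelForSpeechSeq2Seq.from_pretrained("GroupSix/whisper-small-sv")

# "chunk0.wav" is a hypothetical chunk from split_audio, assumed to be
# 16 kHz already; otherwise resample as in transcribe_audio above
waveform, sample_rate = torchaudio.load("chunk0.wav")
inputs = processor(waveform.mean(dim=0).numpy(), sampling_rate=16000, return_tensors="pt")

# Force Swedish transcription instead of relying on language auto-detection
forced_ids = processor.get_decoder_prompt_ids(language="swedish", task="transcribe")
with torch.no_grad():
    generated_ids = model.generate(inputs.input_features, forced_decoder_ids=forced_ids)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])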
requirements.txt
CHANGED
@@ -1,2 +1,6 @@
 tensorflow
-
+torch==2.1.1
+torchaudio==2.1.1
+torchvision==0.16.1
+transformers==4.35.2
+yt-dlp==2023.11.16
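Two packaging notes on this list, not part of the commit: app.py imports pydub, which is not pinned here and is not guaranteed to be preinstalled, so the Space would likely also need a line such as (version illustrative):

pydub==0.25.1

And both pydub and yt-dlp's FFmpegExtractAudio post-processor shell out to the ffmpeg binary, which pip cannot install; on a Hugging Face Space that system dependency is typically supplied via a packages.txt file containing:

ffmpeg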