martenb committed
Commit 6e2e5b1 · 1 Parent(s): 6863878

Updated app at Wed 6 Dec 2023 19:51:42 CET

Files changed (2):
  1. app.py +133 -1
  2. requirements.txt +5 -1
app.py CHANGED
@@ -1,3 +1,135 @@
+ import os
  import gradio as gr
+ import yt_dlp
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+ import torch
+ import torchaudio
+ from pydub import AudioSegment
+ from pydub.silence import split_on_silence
 
- gr.load("models/martenb/whisper-small-sv").launch()
+
+
+ def download_video(url):
+     """Download a video and extract its audio track.
+
+     :param url: The URL of the video to download.
+     :return: Path to the downloaded audio file.
+     """
+     ydl_opts = {
+         'format': 'bestaudio/best',
+         'postprocessors': [{
+             'key': 'FFmpegExtractAudio',
+             'preferredcodec': 'wav',
+             'preferredquality': '192',
+         }],
+         'outtmpl': 'downloaded_audio.%(ext)s',
+     }
+
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         ydl.download([url])
+
+     return 'downloaded_audio.wav'
+
+
+ def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
+     """Split an audio file into chunks at points of silence.
+
+     :param audio_file: Path to the audio file.
+     :param min_silence_len: Minimum length of silence (in ms) to treat as a split point.
+     :param silence_thresh: Silence threshold (in dBFS).
+     :param keep_silence: Amount of silence (in ms) to keep at the start and end of each chunk.
+     :param max_length: Maximum length of each chunk (in ms).
+     :return: List of paths to the audio chunks.
+     """
+     # Load the audio file
+     audio = AudioSegment.from_file(audio_file)
+
+     # Split the audio file into chunks at points of silence
+     chunks = split_on_silence(
+         audio,
+         min_silence_len=min_silence_len,
+         silence_thresh=silence_thresh,
+         keep_silence=keep_silence
+     )
+
+     # Further split chunks that are still too long; pydub's step-slicing
+     # (chunk[::max_length]) yields consecutive max_length-ms windows
+     split_chunks = []
+     for chunk in chunks:
+         if len(chunk) <= max_length:
+             split_chunks.append(chunk)
+         else:
+             split_chunks.extend(chunk[::max_length])
+
+     # Export the chunks to files
+     chunk_filenames = []
+     for i, chunk in enumerate(split_chunks):
+         chunk_name = f"chunk{i}.wav"
+         chunk.export(chunk_name, format="wav")
+         chunk_filenames.append(chunk_name)
+
+     return chunk_filenames
+
+
+ # Load the model and processor from the Hugging Face Hub
+ processor = AutoProcessor.from_pretrained("GroupSix/whisper-small-sv")
+ model = AutoModelForSpeechSeq2Seq.from_pretrained("GroupSix/whisper-small-sv")
+
+
+ def transcribe_audio(segment, num_segments):
+     print(f"Current segment: {segment} (out of {num_segments})")
+
+     # Load the audio file
+     waveform, sample_rate = torchaudio.load(segment)
+
+     # Resample to the 16 kHz rate the model expects, if necessary
+     if sample_rate != 16000:
+         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+         waveform = resampler(waveform)
+
+     # Mix down to mono, since the model expects single-channel input
+     if waveform.shape[0] > 1:
+         waveform = waveform.mean(dim=0, keepdim=True)
+
+     # Extract input features with the processor
+     inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
+
+     # Generate the transcription
+     with torch.no_grad():
+         generated_ids = model.generate(**inputs)
+
+     # Decode the output and select the first transcription
+     decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
+     transcription = decoded_output[0] if decoded_output else ""
+
+     return transcription
+
+
+ # Main function to tie everything together
+ def process_video(url):
+     # Download and split the audio
+     audio_file = download_video(url)
+     segments = split_audio(audio_file)
+
+     # Transcribe each segment
+     transcriptions = [transcribe_audio(segment, len(segments)) for segment in segments]
+
+     # Delete the audio file and the chunks
+     os.remove(audio_file)
+     for segment in segments:
+         os.remove(segment)
+
+     # Join the per-chunk results so the output Textbox receives one string
+     return "\n".join(transcriptions)
+
+
+ # Gradio interface
+ iface = gr.Interface(
+     fn=process_video,
+     inputs=gr.Textbox(label="Swedish YouTube Video URL"),
+     outputs=gr.Textbox(label="Transcriptions"),
+     examples=[
+         ["https://www.youtube.com/watch?v=hcxwTgEC7IM"],  # Fred på jorden
+         ["https://www.youtube.com/watch?v=AzlipxrzMe4"],  # Jerry talar spanska
+         ["https://www.youtube.com/watch?v=H_16_5kGh3I"],  # Det heter näsa, inte nos!
+         ["https://www.youtube.com/watch?v=v2m4V6FUseQ"],  # Ove blir arg på pantsystemet
+         ["https://www.youtube.com/watch?v=oA5QJHBNQkU"],  # Hur mår björnen egentligen?
+     ]
+ )
+
+ iface.launch()
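A note on the chunking in split_audio() above: pydub overloads step-slicing, so sound[::n] yields consecutive n-millisecond windows. A minimal, self-contained sketch of that idiom, built on generated silence so it needs no external files:

    from pydub import AudioSegment

    # 75 s of generated silence stands in for a real recording
    audio = AudioSegment.silent(duration=75_000)

    # Step-slicing yields consecutive 30 s windows, as in split_audio()
    windows = list(audio[::30_000])
    print([len(w) for w in windows])  # [30000, 30000, 15000]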
requirements.txt CHANGED
@@ -1,2 +1,6 @@
  tensorflow
- transformers==4.35.2
+ torch==2.1.1
+ torchaudio==2.1.1
+ torchvision==0.16.1
+ transformers==4.35.2
+ yt-dlp==2023.11.16
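Once the pinned requirements are installed, a quick smoke test (a hypothetical check, not part of the commit) can confirm the environment resolved as expected:

    # Confirm the new pins import and report the expected versions
    import torch, torchaudio, torchvision, transformers, yt_dlp

    print(torch.__version__)           # expected 2.1.1
    print(torchaudio.__version__)      # expected 2.1.1
    print(torchvision.__version__)     # expected 0.16.1
    print(transformers.__version__)    # expected 4.35.2
    print(yt_dlp.version.__version__)  # expected 2023.11.16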