rawanahmed committed
Commit f97e8dc · verified · 1 parent: ca2dbad

Upload 2 files

Files changed (2)
  1. app.py +37 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,37 @@
+ import gradio as gr
+ import librosa
+ import torch
+ from datasets import load_dataset
+ from transformers import SpeechT5ForSpeechToText, SpeechT5Processor
+
+ # Load the English subset of the VoxPopuli dataset
+ dataset = load_dataset("facebook/voxpopuli", "en")
+
+ # Example function to load audio and transcriptions
+ def get_sample(dataset):
+     # Get the first sample from the training set (change the index to pick any other sample)
+     sample = dataset["train"][0]
+     audio_file = sample["audio"]["path"]
+     transcription = sample["normalized_text"]  # VoxPopuli stores transcripts under "normalized_text"
+     return audio_file, transcription
+
+ # Initialize the SpeechT5 ASR model and processor
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
+ model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")
+
+ # Gradio interface function: takes the path of an uploaded audio file and returns its transcription
+ def transcribe(audio_path):
+     # Load the audio at the 16 kHz sampling rate the model expects
+     speech, _ = librosa.load(audio_path, sr=16000)
+     inputs = processor(audio=speech, sampling_rate=16000, return_tensors="pt")
+     # SpeechT5ForSpeechToText is a seq2seq model, so decode with generate() rather than raw logits
+     with torch.no_grad():
+         predicted_ids = model.generate(**inputs, max_length=200)
+     return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+ # Load a sample to check that everything is set up
+ audio_file, transcription = get_sample(dataset)
+
+ # Set up the Gradio interface and launch it
+ iface = gr.Interface(fn=transcribe, inputs=gr.Audio(type="filepath"), outputs="text")
+ iface.launch()
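As a quick way to verify the setup, the model's output can be compared against the reference transcript of the VoxPopuli sample returned by get_sample. A minimal sketch, assuming the definitions from app.py above (dataset, get_sample, transcribe) are already loaded in an interactive session and run before iface.launch():

# Compare the SpeechT5 ASR output against the dataset's reference transcript
audio_file, reference = get_sample(dataset)   # first VoxPopuli training sample
prediction = transcribe(audio_file)           # model transcription for the same clip
print("Reference: ", reference)
print("Prediction:", prediction)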
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers==4.30.0
+ datasets==2.15.0
+ torch==2.1.0
+ librosa==0.10.0
+ soundfile==0.10.3.post1