Spaces:
Running
Running
abrar-adnan
committed on
Commit
·
197af76
1
Parent(s):
90452ba
added audio transcription
Browse files
- app.py +31 -15
- requirements.txt +0 -0
app.py
CHANGED
@@ -6,8 +6,9 @@ from fastai.vision.all import load_learner
|
|
6 |
import time
|
7 |
import base64
|
8 |
from deepface import DeepFace
|
9 |
-
import torchaudio
|
10 |
-
import
|
|
|
11 |
|
12 |
# import pathlib
|
13 |
# temp = pathlib.PosixPath
|
@@ -22,6 +23,32 @@ backends = [
|
|
22 |
'mediapipe'
|
23 |
]
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
model = load_learner("gaze-recognizer-v3.pkl")
|
26 |
|
27 |
def video_processing(video_file, encoded_video):
|
@@ -45,19 +72,8 @@ def video_processing(video_file, encoded_video):
|
|
45 |
|
46 |
start_time = time.time()
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
waveform, sample_rate
|
51 |
-
|
52 |
-
waveform, sample_rate = torchaudio.load("audio.wav")
|
53 |
-
resampler = torchaudio.transforms.Resample(sample_rate, 16000)
|
54 |
-
waveform = resampler(waveform)[0]
|
55 |
-
|
56 |
-
input_features = processor(waveform.squeeze(dim=0), return_tensors="pt").input_features
|
57 |
-
predicted_ids = model.generate(input_features)
|
58 |
-
|
59 |
-
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
|
60 |
-
print(transcription[0])
|
61 |
|
62 |
video_capture = cv2.VideoCapture(video_file)
|
63 |
on_camera = 0
|
|
|
6 |
import time
|
7 |
import base64
|
8 |
from deepface import DeepFace
|
9 |
+
import torchaudio
|
10 |
+
import moviepy.editor as mp
|
11 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
12 |
|
13 |
# import pathlib
|
14 |
# temp = pathlib.PosixPath
|
|
|
23 |
'mediapipe'
|
24 |
]
|
25 |
|
26 |
+
def getTranscription(path):
|
27 |
+
# Insert Local Video File Path
|
28 |
+
clip = mp.VideoFileClip(path)
|
29 |
+
|
30 |
+
# Insert Local Audio File Path
|
31 |
+
clip.audio.write_audiofile(r"audio.wav")
|
32 |
+
|
33 |
+
waveform, sample_rate = torchaudio.load("audio.wav")
|
34 |
+
waveform, sample_rate
|
35 |
+
|
36 |
+
waveform, sample_rate = torchaudio.load("audio.wav")
|
37 |
+
resampler = torchaudio.transforms.Resample(sample_rate, 16000)
|
38 |
+
waveform = resampler(waveform)[0]
|
39 |
+
|
40 |
+
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
|
41 |
+
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
|
42 |
+
model.config.forced_decoder_ids = None
|
43 |
+
|
44 |
+
input_features = processor(waveform.squeeze(dim=0), return_tensors="pt").input_features
|
45 |
+
predicted_ids = model.generate(input_features)
|
46 |
+
|
47 |
+
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
|
48 |
+
|
49 |
+
return transcription[0]
|
50 |
+
|
51 |
+
|
52 |
model = load_learner("gaze-recognizer-v3.pkl")
|
53 |
|
54 |
def video_processing(video_file, encoded_video):
|
|
|
72 |
|
73 |
start_time = time.time()
|
74 |
|
75 |
+
transcription = getTranscription(video_file)
|
76 |
+
print(transcription)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
video_capture = cv2.VideoCapture(video_file)
|
79 |
on_camera = 0
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|