space-sue committed
Commit 9310327 · 1 Parent(s): 541d3dc

initial commit

Files changed (7)
  1. README.md +5 -7
  2. app.py +75 -0
  3. face_emotion_detection.py +124 -0
  4. facial_analysis.py +334 -0
  5. packages.txt +1 -0
  6. requirements.txt +6 -0
  7. vid_to_wav.py +17 -0
README.md CHANGED
@@ -1,13 +1,11 @@
  ---
- title: Hf Speech Eval
- emoji: 🌖
- colorFrom: indigo
- colorTo: pink
+ title: Speech Evaluation
+ emoji: 💬
+ colorFrom: gray
+ colorTo: blue
  sdk: gradio
- sdk_version: 3.24.1
+ sdk_version: 3.23.0
  app_file: app.py
  pinned: false
- license: cc-by-nc-sa-4.0
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py ADDED
@@ -0,0 +1,75 @@
+ import gradio as gr
+ import torch.cuda
+ import whisper
+ from whisper.tokenizer import LANGUAGES
+
+ from vid_to_wav import extract_audio
+ from face_emotion_detection import process_video
+
+ gpu = torch.cuda.is_available()
+ model = None  # Whisper model, loaded in get_interface()
+
+
+ def analyze_transcription(text, duration):
+     """Build a short speech-rate summary from the transcript and the clip duration (seconds)."""
+     word_count = len(text.split())
+     analysis_text = "The video is {} sec. long and the speaker speaks {} words. ".format(
+         duration, word_count)
+     duration_in_min = duration / 60
+     words_per_min = round(word_count / duration_in_min)
+     analysis_text += "The speech speed is {} words per minute. ".format(words_per_min)
+     if words_per_min < 130:
+         analysis_text += "The speaker speaks more slowly than average speakers."
+     elif words_per_min > 150:
+         analysis_text += "The speaker speaks faster than average speakers."
+     else:
+         analysis_text += "The speaker maintains a normal pace, making the speech comprehensible to most audiences!"
+     return analysis_text
+
+
+ def transcribe(filepath, language, task):
+     # Annotate the uploaded video with facial emotions, then transcribe its audio track.
+     video = process_video(filepath)
+     audio, audio_file, duration = extract_audio(filepath)
+     language = None if language == "Detect" else language
+     text = model.transcribe(
+         audio_file, task=task.lower(), language=language, fp16=gpu,
+     )["text"].strip()
+     return video, text, analyze_transcription(text, duration)
+
+
+ def get_interface(model_name="medium"):
+     global model
+     model = whisper.load_model(model_name)
+
+     return gr.Interface(
+         fn=transcribe,
+         inputs=[
+             # gr.Audio(label="Record", source="microphone", type="filepath"),
+             gr.Video(label="Upload", source="upload", type="filepath"),
+             gr.Dropdown(
+                 label="Language",
+                 choices=["Detect"] + sorted([i.title() for i in LANGUAGES.values()]),
+                 value="Detect",
+             ),
+             gr.Dropdown(
+                 label="Task",
+                 choices=["Transcribe", "Translate"],
+                 value="Transcribe",
+                 info="Whether to perform X->X speech recognition or X->English translation",
+             ),
+         ],
+         outputs=[
+             gr.Video(label="Emotion Analysis"),
+             gr.Textbox(label="Transcription", lines=26),
+             gr.Textbox(label="Speech Analysis", lines=4),
+         ],
+         # theme=gr.themes.Default(),
+         theme=gr.themes.Glass(
+             primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.purple),
+         title="Whisper is listening to you",
+         # description=DESCRIPTION,
+         allow_flagging="never",
+     )
+
+
+ demo = get_interface()
+ demo.queue().launch(debug=True)
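
A minimal, illustrative sketch of the words-per-minute arithmetic that analyze_transcription performs; the transcript and duration below are made-up example values, not part of the commit.

# Example of the speech-rate calculation used in analyze_transcription (illustrative values).
transcript = "hello everyone and welcome to this short demo of the speech evaluation space"
duration_sec = 6.0  # hypothetical clip length in seconds

word_count = len(transcript.split())                      # 13 words
words_per_min = round(word_count / (duration_sec / 60))   # 13 / 0.1 = 130 wpm

if words_per_min < 130:
    verdict = "slower than average"
elif words_per_min > 150:
    verdict = "faster than average"
else:
    verdict = "a normal, comprehensible pace"
print(word_count, words_per_min, verdict)  # 13 130 a normal, comprehensible pace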
face_emotion_detection.py ADDED
@@ -0,0 +1,124 @@
+ import argparse
+ import datetime
+ import json
+ import os
+
+ import cv2
+ import numpy as np
+ import tensorflow as tf
+ from tensorflow.keras.models import load_model
+ from tensorflow.compat.v1.keras.backend import set_session
+
+ from facial_analysis import FacialImageProcessing
+
+
+ class NpEncoder(json.JSONEncoder):
+     """JSON encoder that converts NumPy scalars and arrays to plain Python types."""
+     def default(self, obj):
+         if isinstance(obj, np.integer):
+             return int(obj)
+         if isinstance(obj, np.floating):
+             return float(obj)
+         if isinstance(obj, np.ndarray):
+             return obj.tolist()
+         return super(NpEncoder, self).default(obj)
+
+
+ def initialize():
+     # Let TensorFlow grow GPU memory on demand instead of claiming it all at once.
+     config = tf.compat.v1.ConfigProto()
+     config.gpu_options.allow_growth = True
+     sess = tf.compat.v1.Session(config=config)
+     set_session(sess)
+
+
+ def mobilenet_preprocess_input(x, **kwargs):
+     # Subtract the ImageNet channel means (Caffe-style preprocessing).
+     x[..., 0] -= 103.939
+     x[..., 1] -= 116.779
+     x[..., 2] -= 123.68
+     return x
+
+
+ def detect_emotion(frame_bgr):
+     # Note: the face detector and emotion model are re-created on every call,
+     # which is simple but slow; caching them would speed up process_video considerably.
+     imgProcessing = FacialImageProcessing(False)
+     model = load_model('./models/affectnet_emotions/mobilenet_7.h5')
+     # print(model.summary())
+     preprocessing_function = mobilenet_preprocess_input
+     INPUT_SIZE = (224, 224)
+     idx_to_class = {0: 'Anger', 1: 'Disgust', 2: 'Fear',
+                     3: 'Happiness', 4: 'Neutral', 5: 'Sadness', 6: 'Surprise'}
+
+     frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+     bounding_boxes, points = imgProcessing.detect_faces(frame)
+     points = points.T
+     detections = {"id": str(datetime.datetime.now())}
+
+     for bbox, p in zip(bounding_boxes, points):
+         face_pred = {}
+         box = bbox.astype(int)
+         x1, y1, x2, y2 = box[0:4]
+         face_img = frame[y1:y2, x1:x2, :]
+         try:
+             face_img = cv2.resize(face_img, INPUT_SIZE)
+         except cv2.error:
+             break  # skip frames whose face crop is empty or degenerate
+         inp = preprocessing_function(face_img.astype(np.float32))
+         inp = np.expand_dims(inp, axis=0)
+         scores = model.predict(inp)[0]
+         frame = cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 9, 12), 4)
+         cv2.putText(frame, idx_to_class[np.argmax(scores)] + ' ' + str(scores[np.argmax(scores)]),
+                     (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)
+         face_pred["face_bbox"] = [x1, y1, x2, y2]
+         face_pred["emotion_predicted"] = idx_to_class[np.argmax(scores)]
+         all_scores = {}
+         for i in range(len(scores)):
+             all_scores[str(idx_to_class[i])] = scores[i]
+         face_pred["scores"] = all_scores
+
+         detections["face"] = face_pred  # note: only the last detected face is kept per frame
+     frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+     print(detections)
+     return frame, detections
+
+
+ def process_video(video):
+     basename = os.path.basename(video)
+     name_only = os.path.splitext(basename)[0]
+     video_outputpath = os.path.join('./output', basename)
+     json_outputpath = os.path.join('./output', name_only + '.json')
+
+     # Write per-frame detections to a JSON file and the annotated frames to a new video.
+     with open(json_outputpath, "w") as jsonfile:
+         videocap = cv2.VideoCapture(video)
+         ret, frame = videocap.read()
+         fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
+         fps = 24.0
+         size = (frame.shape[1], frame.shape[0])
+         out = cv2.VideoWriter(video_outputpath, fourcc, fps, size)
+         max_frames = 50  # cap the number of processed frames to keep runtime bounded
+         cnt = 0
+         while ret and cnt < max_frames:
+             processed_frame, detections = detect_emotion(frame)
+             json_object = json.dumps(detections, indent=4, cls=NpEncoder)
+             jsonfile.write(json_object)
+             # cv2.imshow is unavailable in headless environments such as Spaces, so preview is disabled:
+             # cv2.imshow('img', np.array(processed_frame, dtype=np.uint8))
+             # cv2.waitKey(1)
+             out.write(processed_frame)
+             ret, frame = videocap.read()
+             cnt += 1
+         videocap.release()
+         out.release()
+     cv2.destroyAllWindows()
+     return video_outputpath
+
+
+ def main():
+     parser = argparse.ArgumentParser(description='Analysis of Video')
+     parser.add_argument(
+         '-v', '--video', help='Video to be analysed', required=True)
+     args = parser.parse_args()
+     process_video(args.video)
+
+
+ if __name__ == '__main__':
+     main()
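
A possible local usage sketch for process_video, assuming the AffectNet MobileNet weights are available at ./models/affectnet_emotions/mobilenet_7.h5 and the MTCNN graph under ./models/face_detection; "sample_talk.mp4" is a placeholder filename.

# Illustrative local run of process_video (placeholder input file).
import os

from face_emotion_detection import process_video

os.makedirs("./output", exist_ok=True)              # process_video writes its video and JSON here
annotated_path = process_video("sample_talk.mp4")   # hypothetical input clip
print("annotated video written to", annotated_path)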
facial_analysis.py ADDED
@@ -0,0 +1,334 @@
+ # Reduced version of https://github.com/HSE-asavchenko/HSE_FaceRec_tf/blob/master/age_gender_identity/facial_analysis.py
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ import os
+ # os.environ['CUDA_VISIBLE_DEVICES'] = ''
+
+ import cv2
+ import numpy as np
+ import tensorflow as tf
+
+
+ def is_specialfile(path, exts):
+     _, file_extension = os.path.splitext(path)
+     return file_extension.lower() in exts
+
+
+ img_extensions = ['.jpg', '.jpeg', '.png']
+ def is_image(path):
+     return is_specialfile(path, img_extensions)
+
+
+ video_extensions = ['.mov', '.avi']
+ def is_video(path):
+     return is_specialfile(path, video_extensions)
+
+
+ class FacialImageProcessing:
+     # minsize: minimum face size (in pixels) that the detector will report
+     def __init__(self, print_stat=False, minsize=32):
+         self.print_stat = print_stat
+         self.minsize = minsize
+
+         models_path, _ = os.path.split(os.path.realpath(__file__))
+         models_path = os.path.join(models_path, 'models', 'face_detection')
+         model_files = {os.path.join(models_path, 'mtcnn.pb'): ''}
+
+         with tf.Graph().as_default() as full_graph:
+             for model_file in model_files:
+                 tf.import_graph_def(FacialImageProcessing.load_graph_def(model_file), name=model_files[model_file])
+         self.sess = tf.compat.v1.Session(graph=full_graph)  # , config=tf.ConfigProto(device_count={'CPU': 1, 'GPU': 0})
+         self.pnet, self.rnet, self.onet = FacialImageProcessing.load_mtcnn(self.sess, full_graph)
+
+     def close(self):
+         self.sess.close()
+
+     @staticmethod
+     def load_graph_def(frozen_graph_filename):
+         graph_def = None
+         with tf.io.gfile.GFile(frozen_graph_filename, 'rb') as f:
+             graph_def = tf.compat.v1.GraphDef()
+             graph_def.ParseFromString(f.read())
+         return graph_def
+
+     @staticmethod
+     def load_graph(frozen_graph_filename, prefix=''):
+         graph_def = FacialImageProcessing.load_graph_def(frozen_graph_filename)
+         with tf.Graph().as_default() as graph:
+             tf.import_graph_def(graph_def, name=prefix)
+         return graph
+
+     @staticmethod
+     def load_mtcnn(sess, graph):
+         pnet_out_1 = graph.get_tensor_by_name('pnet/conv4-2/BiasAdd:0')
+         pnet_out_2 = graph.get_tensor_by_name('pnet/prob1:0')
+         pnet_in = graph.get_tensor_by_name('pnet/input:0')
+
+         rnet_out_1 = graph.get_tensor_by_name('rnet/conv5-2/conv5-2:0')
+         rnet_out_2 = graph.get_tensor_by_name('rnet/prob1:0')
+         rnet_in = graph.get_tensor_by_name('rnet/input:0')
+
+         onet_out_1 = graph.get_tensor_by_name('onet/conv6-2/conv6-2:0')
+         onet_out_2 = graph.get_tensor_by_name('onet/conv6-3/conv6-3:0')
+         onet_out_3 = graph.get_tensor_by_name('onet/prob1:0')
+         onet_in = graph.get_tensor_by_name('onet/input:0')
+
+         pnet_fun = lambda img: sess.run((pnet_out_1, pnet_out_2), feed_dict={pnet_in: img})
+         rnet_fun = lambda img: sess.run((rnet_out_1, rnet_out_2), feed_dict={rnet_in: img})
+         onet_fun = lambda img: sess.run((onet_out_1, onet_out_2, onet_out_3), feed_dict={onet_in: img})
+         return pnet_fun, rnet_fun, onet_fun
+
+     @staticmethod
+     def bbreg(boundingbox, reg):
+         # calibrate bounding boxes
+         if reg.shape[1] == 1:
+             reg = np.reshape(reg, (reg.shape[2], reg.shape[3]))
+
+         w = boundingbox[:, 2] - boundingbox[:, 0] + 1
+         h = boundingbox[:, 3] - boundingbox[:, 1] + 1
+         b1 = boundingbox[:, 0] + reg[:, 0] * w
+         b2 = boundingbox[:, 1] + reg[:, 1] * h
+         b3 = boundingbox[:, 2] + reg[:, 2] * w
+         b4 = boundingbox[:, 3] + reg[:, 3] * h
+         boundingbox[:, 0:4] = np.transpose(np.vstack([b1, b2, b3, b4]))
+         return boundingbox
+
+     @staticmethod
+     def generateBoundingBox(imap, reg, scale, t):
+         # use heatmap to generate bounding boxes
+         stride = 2
+         cellsize = 12
+
+         imap = np.transpose(imap)
+         dx1 = np.transpose(reg[:, :, 0])
+         dy1 = np.transpose(reg[:, :, 1])
+         dx2 = np.transpose(reg[:, :, 2])
+         dy2 = np.transpose(reg[:, :, 3])
+         y, x = np.where(imap >= t)
+         if y.shape[0] == 1:
+             dx1 = np.flipud(dx1)
+             dy1 = np.flipud(dy1)
+             dx2 = np.flipud(dx2)
+             dy2 = np.flipud(dy2)
+         score = imap[(y, x)]
+         reg = np.transpose(np.vstack([dx1[(y, x)], dy1[(y, x)], dx2[(y, x)], dy2[(y, x)]]))
+         if reg.size == 0:
+             reg = np.empty((0, 3))
+         bb = np.transpose(np.vstack([y, x]))
+         q1 = np.fix((stride * bb + 1) / scale)
+         q2 = np.fix((stride * bb + cellsize - 1 + 1) / scale)
+         boundingbox = np.hstack([q1, q2, np.expand_dims(score, 1), reg])
+         return boundingbox, reg
+
+     # function pick = nms(boxes,threshold,type)
+     @staticmethod
+     def nms(boxes, threshold, method):
+         if boxes.size == 0:
+             return np.empty((0, 3))
+         x1 = boxes[:, 0]
+         y1 = boxes[:, 1]
+         x2 = boxes[:, 2]
+         y2 = boxes[:, 3]
+         s = boxes[:, 4]
+         area = (x2 - x1 + 1) * (y2 - y1 + 1)
+         I = np.argsort(s)
+         pick = np.zeros_like(s, dtype=np.int16)
+         counter = 0
+         while I.size > 0:
+             i = I[-1]
+             pick[counter] = i
+             counter += 1
+             idx = I[0:-1]
+             xx1 = np.maximum(x1[i], x1[idx])
+             yy1 = np.maximum(y1[i], y1[idx])
+             xx2 = np.minimum(x2[i], x2[idx])
+             yy2 = np.minimum(y2[i], y2[idx])
+             w = np.maximum(0.0, xx2 - xx1 + 1)
+             h = np.maximum(0.0, yy2 - yy1 + 1)
+             inter = w * h
+             if method == 'Min':
+                 o = inter / np.minimum(area[i], area[idx])
+             else:
+                 o = inter / (area[i] + area[idx] - inter)
+             I = I[np.where(o <= threshold)]
+         pick = pick[0:counter]
+         return pick
+
+     # function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h)
+     @staticmethod
+     def pad(total_boxes, w, h):
+         # compute the padding coordinates (pad the bounding boxes to square)
+         tmpw = (total_boxes[:, 2] - total_boxes[:, 0] + 1).astype(np.int32)
+         tmph = (total_boxes[:, 3] - total_boxes[:, 1] + 1).astype(np.int32)
+         numbox = total_boxes.shape[0]
+
+         dx = np.ones((numbox), dtype=np.int32)
+         dy = np.ones((numbox), dtype=np.int32)
+         edx = tmpw.copy().astype(np.int32)
+         edy = tmph.copy().astype(np.int32)
+
+         x = total_boxes[:, 0].copy().astype(np.int32)
+         y = total_boxes[:, 1].copy().astype(np.int32)
+         ex = total_boxes[:, 2].copy().astype(np.int32)
+         ey = total_boxes[:, 3].copy().astype(np.int32)
+
+         tmp = np.where(ex > w)
+         edx.flat[tmp] = np.expand_dims(-ex[tmp] + w + tmpw[tmp], 1)
+         ex[tmp] = w
+
+         tmp = np.where(ey > h)
+         edy.flat[tmp] = np.expand_dims(-ey[tmp] + h + tmph[tmp], 1)
+         ey[tmp] = h
+
+         tmp = np.where(x < 1)
+         dx.flat[tmp] = np.expand_dims(2 - x[tmp], 1)
+         x[tmp] = 1
+
+         tmp = np.where(y < 1)
+         dy.flat[tmp] = np.expand_dims(2 - y[tmp], 1)
+         y[tmp] = 1
+
+         return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph
+
+     # function [bboxA] = rerec(bboxA)
+     @staticmethod
+     def rerec(bboxA):
+         # convert bboxA to square
+         h = bboxA[:, 3] - bboxA[:, 1]
+         w = bboxA[:, 2] - bboxA[:, 0]
+         l = np.maximum(w, h)
+         bboxA[:, 0] = bboxA[:, 0] + w * 0.5 - l * 0.5
+         bboxA[:, 1] = bboxA[:, 1] + h * 0.5 - l * 0.5
+         bboxA[:, 2:4] = bboxA[:, 0:2] + np.transpose(np.tile(l, (2, 1)))
+         return bboxA
+
+     def detect_faces(self, img):
+         # img: input RGB image
+         # threshold: [th1, th2, th3] are the thresholds of the three MTCNN stages
+         threshold = [0.6, 0.7, 0.9]
+         factor = 0.709  # scale factor of the image pyramid
+         factor_count = 0
+         total_boxes = np.empty((0, 9))
+         points = np.array([])
+         h = img.shape[0]
+         w = img.shape[1]
+         minl = np.amin([h, w])
+         m = 12.0 / self.minsize
+         minl = minl * m
+         # create scale pyramid
+         scales = []
+         while minl >= 12:
+             scales += [m * np.power(factor, factor_count)]
+             minl = minl * factor
+             factor_count += 1
+
+         # first stage: proposal network (P-Net) over the image pyramid
+         for j in range(len(scales)):
+             scale = scales[j]
+             hs = int(np.ceil(h * scale))
+             ws = int(np.ceil(w * scale))
+             im_data = cv2.resize(img, (ws, hs), interpolation=cv2.INTER_AREA)
+             im_data = (im_data - 127.5) * 0.0078125
+             img_x = np.expand_dims(im_data, 0)
+             img_y = np.transpose(img_x, (0, 2, 1, 3))
+             out = self.pnet(img_y)
+             out0 = np.transpose(out[0], (0, 2, 1, 3))
+             out1 = np.transpose(out[1], (0, 2, 1, 3))
+
+             boxes, _ = FacialImageProcessing.generateBoundingBox(out1[0, :, :, 1].copy(), out0[0, :, :, :].copy(), scale, threshold[0])
+
+             # inter-scale nms
+             pick = FacialImageProcessing.nms(boxes.copy(), 0.5, 'Union')
+             if boxes.size > 0 and pick.size > 0:
+                 boxes = boxes[pick, :]
+                 total_boxes = np.append(total_boxes, boxes, axis=0)
+
+         numbox = total_boxes.shape[0]
+         if numbox > 0:
+             pick = FacialImageProcessing.nms(total_boxes.copy(), 0.7, 'Union')
+             total_boxes = total_boxes[pick, :]
+             regw = total_boxes[:, 2] - total_boxes[:, 0]
+             regh = total_boxes[:, 3] - total_boxes[:, 1]
+             qq1 = total_boxes[:, 0] + total_boxes[:, 5] * regw
+             qq2 = total_boxes[:, 1] + total_boxes[:, 6] * regh
+             qq3 = total_boxes[:, 2] + total_boxes[:, 7] * regw
+             qq4 = total_boxes[:, 3] + total_boxes[:, 8] * regh
+             total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:, 4]]))
+             total_boxes = FacialImageProcessing.rerec(total_boxes.copy())
+             total_boxes[:, 0:4] = np.fix(total_boxes[:, 0:4]).astype(np.int32)
+             dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = FacialImageProcessing.pad(total_boxes.copy(), w, h)
+
+         numbox = total_boxes.shape[0]
+         if numbox > 0:
+             # second stage: refinement network (R-Net)
+             tempimg = np.zeros((24, 24, 3, numbox))
+             for k in range(0, numbox):
+                 tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
+                 tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = img[y[k] - 1:ey[k], x[k] - 1:ex[k], :]
+                 if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
+                     tempimg[:, :, :, k] = cv2.resize(tmp, (24, 24), interpolation=cv2.INTER_AREA)
+                 else:
+                     return np.empty((0, 5)), np.array([])  # degenerate crop: no valid detections
+             tempimg = (tempimg - 127.5) * 0.0078125
+             tempimg1 = np.transpose(tempimg, (3, 1, 0, 2))
+             out = self.rnet(tempimg1)
+             out0 = np.transpose(out[0])
+             out1 = np.transpose(out[1])
+             score = out1[1, :]
+             ipass = np.where(score > threshold[1])
+             total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(), np.expand_dims(score[ipass].copy(), 1)])
+             mv = out0[:, ipass[0]]
+             if total_boxes.shape[0] > 0:
+                 pick = FacialImageProcessing.nms(total_boxes, 0.7, 'Union')
+                 total_boxes = total_boxes[pick, :]
+                 total_boxes = FacialImageProcessing.bbreg(total_boxes.copy(), np.transpose(mv[:, pick]))
+                 total_boxes = FacialImageProcessing.rerec(total_boxes.copy())
+
+         numbox = total_boxes.shape[0]
+         if numbox > 0:
+             # third stage: output network (O-Net), also predicts facial landmarks
+             total_boxes = np.fix(total_boxes).astype(np.int32)
+             dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = FacialImageProcessing.pad(total_boxes.copy(), w, h)
+             tempimg = np.zeros((48, 48, 3, numbox))
+             for k in range(0, numbox):
+                 tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
+                 tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = img[y[k] - 1:ey[k], x[k] - 1:ex[k], :]
+                 if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
+                     tempimg[:, :, :, k] = cv2.resize(tmp, (48, 48), interpolation=cv2.INTER_AREA)
+                 else:
+                     return np.empty((0, 5)), np.array([])  # degenerate crop: no valid detections
+             tempimg = (tempimg - 127.5) * 0.0078125
+             tempimg1 = np.transpose(tempimg, (3, 1, 0, 2))
+             out = self.onet(tempimg1)
+             out0 = np.transpose(out[0])
+             out1 = np.transpose(out[1])
+             out2 = np.transpose(out[2])
+             score = out2[1, :]
+             points = out1
+             ipass = np.where(score > threshold[2])
+             points = points[:, ipass[0]]
+             total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(), np.expand_dims(score[ipass].copy(), 1)])
+             mv = out0[:, ipass[0]]
+
+             w = total_boxes[:, 2] - total_boxes[:, 0] + 1
+             h = total_boxes[:, 3] - total_boxes[:, 1] + 1
+             points[0:5, :] = np.tile(w, (5, 1)) * points[0:5, :] + np.tile(total_boxes[:, 0], (5, 1)) - 1
+             points[5:10, :] = np.tile(h, (5, 1)) * points[5:10, :] + np.tile(total_boxes[:, 1], (5, 1)) - 1
+             if total_boxes.shape[0] > 0:
+                 total_boxes = FacialImageProcessing.bbreg(total_boxes.copy(), np.transpose(mv))
+                 pick = FacialImageProcessing.nms(total_boxes.copy(), 0.7, 'Min')
+                 total_boxes = total_boxes[pick, :]
+                 points = points[:, pick]
+         return total_boxes, points
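
A minimal sketch of calling the MTCNN detector directly, assuming models/face_detection/mtcnn.pb sits next to facial_analysis.py; "speaker.jpg" is a placeholder image path.

# Illustrative standalone face detection with FacialImageProcessing (placeholder image path).
import cv2

from facial_analysis import FacialImageProcessing

proc = FacialImageProcessing(print_stat=False)
img_bgr = cv2.imread("speaker.jpg")
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)   # detect_faces expects RGB, as in detect_emotion

bounding_boxes, points = proc.detect_faces(img_rgb)
for box in bounding_boxes:
    x1, y1, x2, y2, score = box[:5]
    print(f"face at ({x1:.0f}, {y1:.0f})-({x2:.0f}, {y2:.0f}) with confidence {score:.2f}")
proc.close()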
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ torchvision
+ torchaudio
+ openai-whisper
+ gradio
+ moviepy
vid_to_wav.py ADDED
@@ -0,0 +1,17 @@
+ import os
+
+ import moviepy.editor
+
+
+ def extract_audio(vid_filename):
+     """Extract the audio track of a video to a .wav file; returns (audio_clip, wav_path, duration)."""
+     video = moviepy.editor.VideoFileClip(vid_filename)
+     duration = video.duration
+
+     audio = video.audio
+     wav_file_name = ""
+     if audio is not None:
+         # Derive the .wav path from the video filename (e.g. clip.mp4 -> clip.wav).
+         wav_file_name = os.path.splitext(vid_filename)[0] + '.wav'
+         audio.write_audiofile(wav_file_name)
+
+     return audio, wav_file_name, duration
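
A short usage sketch for extract_audio; "sample_talk.mp4" is again a placeholder filename.

# Illustrative call to extract_audio (placeholder input file).
from vid_to_wav import extract_audio

audio_clip, wav_path, duration = extract_audio("sample_talk.mp4")
print(f"{duration:.1f} s of video; audio written to {wav_path or '(no audio track)'}")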