Spaces:
Sleeping
Sleeping
Cristian Tatu
committed on
Commit
·
1e50356
1
Parent(s):
179fb16
v1
Browse files- .gitattributes +8 -0
- README.md +5 -4
- female_1.wav +3 -0
- female_2.wav +3 -0
- female_3.wav +3 -0
- female_4.wav +3 -0
- main.py +13 -0
- male_1.wav +3 -0
- male_2.wav +3 -0
- male_3.wav +3 -0
- male_4.wav +3 -0
- model.onnx +3 -0
- model.py +64 -0
- requirements.txt +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
male_1.wav filter=lfs diff=lfs merge=lfs -text
|
37 |
+
male_2.wav filter=lfs diff=lfs merge=lfs -text
|
38 |
+
male_3.wav filter=lfs diff=lfs merge=lfs -text
|
39 |
+
male_4.wav filter=lfs diff=lfs merge=lfs -text
|
40 |
+
female_1.wav filter=lfs diff=lfs merge=lfs -text
|
41 |
+
female_2.wav filter=lfs diff=lfs merge=lfs -text
|
42 |
+
female_3.wav filter=lfs diff=lfs merge=lfs -text
|
43 |
+
female_4.wav filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
---
|
2 |
title: Cnn Voice Classifier
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.35.0
|
8 |
-
app_file:
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
|
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Cnn Voice Classifier
|
3 |
+
emoji: π§βπΌπ€΅ββοΈ
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.35.0
|
8 |
+
app_file: main.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
+
short_description: Voice gender classification model trained with many datasets
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
female_1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:690abe805180c50c0784f8721f7577707fac609566fe789adf6573f28d627a38
|
3 |
+
size 331050
|
female_2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:23367a63c4966046aa8626dabb486cda8dcdd5554f01d74c6fa08179efd1078b
|
3 |
+
size 67480
|
female_3.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:088ef297b02fbbb7f5606364b806cc4ac5dbf1ad0f6a7736584ca3b67cf34ca1
|
3 |
+
size 330828
|
female_4.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10d6dc49ad7bfa6e5da8f6203564e358fb367bc71215ab27a7037a4014147972
|
3 |
+
size 330828
|
main.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from model import CnnVoiceClassifier
|
3 |
+
|
4 |
+
model = CnnVoiceClassifier()
|
5 |
+
|
6 |
+
audio_component = gr.Audio(type='filepath', label='Upload your audio file here')
|
7 |
+
label_component = gr.Label(label='Gender classification result')
|
8 |
+
|
9 |
+
sample_female = [f'female_{i}.wav' for i in range(1, 5)]
|
10 |
+
sample_male = [f'male_{i}.wav' for i in range(1, 5)]
|
11 |
+
|
12 |
+
demo = gr.Interface(fn=model.inference, inputs=audio_component, outputs=label_component, examples=sample_female + sample_male)
|
13 |
+
demo.launch()
|
male_1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0b3e3539feeb7acddd0648e49529d5dcd017da380abe06eb898d13ecbf8a7ad4
|
3 |
+
size 330828
|
male_2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e4cad1f12d071951045f763b89e68166617a5187ee66676bfc007993c5268db
|
3 |
+
size 330828
|
male_3.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd2a730eb06133800165c03ec672da271430d236fa547dc71ec8270c17d0d6c7
|
3 |
+
size 330828
|
male_4.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd70dc5223a1acf0073eb3ab589c9699f0cabc1e4f3bd77d030054943846dd10
|
3 |
+
size 330828
|
model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2a828584699ca3b798aa47fa0e95447637e6607bf703b3cfde8cccc25974c46
|
3 |
+
size 4609037
|
model.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import onnxruntime as ort
|
2 |
+
import librosa
|
3 |
+
import numpy as np
|
4 |
+
import scipy
|
5 |
+
|
6 |
+
SR = 22050
|
7 |
+
LENGTH_SEC = 1.5
|
8 |
+
|
9 |
+
def resample(audio_data, original_sr, target_sr):
|
10 |
+
num_samples = int(len(audio_data) * target_sr / original_sr)
|
11 |
+
return scipy.signal.resample(audio_data, num_samples)
|
12 |
+
|
13 |
+
|
14 |
+
def load_audio_slices(af):
|
15 |
+
audio, sr = librosa.load(af, sr=None)
|
16 |
+
if sr != SR:
|
17 |
+
audio = resample(audio, sr, SR)
|
18 |
+
|
19 |
+
LENGTH_SAMPLES = int(LENGTH_SEC * SR)
|
20 |
+
|
21 |
+
slices = []
|
22 |
+
if len(audio) < LENGTH_SAMPLES:
|
23 |
+
padding_needed = LENGTH_SAMPLES - len(audio)
|
24 |
+
audio = np.pad(audio, (0, padding_needed), mode='constant')
|
25 |
+
slices.append(audio)
|
26 |
+
else:
|
27 |
+
num_chunks = len(audio) / LENGTH_SAMPLES
|
28 |
+
|
29 |
+
if num_chunks > 2:
|
30 |
+
num_chunks = 5
|
31 |
+
elif num_chunks > 1.5:
|
32 |
+
num_chunks = 3
|
33 |
+
elif num_chunks > 1:
|
34 |
+
num_chunks = 2
|
35 |
+
|
36 |
+
end = len(audio) - LENGTH_SAMPLES
|
37 |
+
idxs_split = np.arange(0, end, end // num_chunks, dtype=int)
|
38 |
+
for idx_split in idxs_split:
|
39 |
+
sl = slice(idx_split, idx_split+LENGTH_SAMPLES)
|
40 |
+
slices.append(audio[sl])
|
41 |
+
|
42 |
+
slices = np.vstack(slices)
|
43 |
+
|
44 |
+
return slices / np.max(slices, axis=1)[:, np.newaxis]
|
45 |
+
|
class CnnVoiceClassifier:
    """Thin ONNX Runtime wrapper around the voice-gender CNN.

    Loads ``model.onnx`` from the working directory once and exposes a single
    ``inference`` method suitable for use as a Gradio callback.
    """

    def __init__(self):
        # NOTE(review): model path is hard-coded relative to the CWD — the
        # Space always runs from the repo root, so this holds here.
        self.session = ort.InferenceSession('model.onnx')
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

    def inference(self, audio_path):
        """Classify the speaker gender of the audio file at *audio_path*.

        Returns a dict mapping 'Male'/'Female' to probabilities summing to 1,
        the format expected by ``gr.Label``.
        """
        audio = load_audio_slices(audio_path)
        # The model expects a trailing channel axis: (slices, samples, 1).
        input_feed = {self.input_name: np.expand_dims(audio, axis=-1)}
        outputs = self.session.run([self.output_name], input_feed)
        probs = outputs[0].flatten()

        # Weight each slice by its confidence — distance from the 0.5
        # decision boundary — so uncertain slices contribute less.
        w = np.abs((probs - 0.5) * 2)
        if np.sum(w) == 0:
            # BUGFIX: if every slice sits exactly on the boundary all weights
            # are zero and np.average raises ZeroDivisionError; fall back to a
            # plain mean instead of crashing.
            final_prob = float(np.mean(probs))
        else:
            final_prob = np.average(probs, weights=w)

        return {'Male': final_prob, 'Female': 1 - final_prob}
61 |
+
|
62 |
+
|
63 |
+
|
64 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
huggingface_hub
|
2 |
+
librosa
|
3 |
+
onnxruntime
|