Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,31 +6,48 @@ from pyannote.audio import Pipeline
|
|
6 |
|
7 |
# instantiate the pipeline
|
8 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
pipeline = Pipeline.from_pretrained(
|
10 |
"pyannote/speaker-diarization-3.1",
|
11 |
-
use_auth_token=
|
|
|
12 |
)
|
13 |
-
|
14 |
-
|
|
|
15 |
pipeline.to(device)
|
|
|
|
|
16 |
except Exception as e:
|
17 |
print(f"Error initializing pipeline: {e}")
|
18 |
pipeline = None
|
19 |
|
20 |
-
|
21 |
def save_audio(audio):
|
22 |
if pipeline is None:
|
23 |
return "Error: Pipeline not initialized"
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
32 |
|
33 |
-
|
|
|
|
|
|
|
|
|
34 |
|
35 |
@spaces.GPU(duration=60 * 2)
|
36 |
def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
|
@@ -38,6 +55,7 @@ def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
|
|
38 |
return "Error: Pipeline not initialized"
|
39 |
|
40 |
try:
|
|
|
41 |
params = {}
|
42 |
if num_speakers > 0:
|
43 |
params["num_speakers"] = num_speakers
|
@@ -46,19 +64,25 @@ def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
|
|
46 |
if max_speakers > 0:
|
47 |
params["max_speakers"] = max_speakers
|
48 |
|
|
|
49 |
diarization = pipeline(temp_file, **params)
|
|
|
50 |
except Exception as e:
|
|
|
51 |
return f"Error processing audio: {e}"
|
52 |
|
53 |
-
#
|
54 |
-
|
|
|
|
|
|
|
|
|
55 |
|
56 |
-
# Return the diarization output
|
57 |
return str(diarization)
|
58 |
-
|
59 |
def timestamp_to_seconds(timestamp):
|
60 |
try:
|
61 |
-
#
|
62 |
h, m, s = map(float, timestamp.split(':'))
|
63 |
return 3600 * h + 60 * m + s
|
64 |
except ValueError as e:
|
@@ -66,7 +90,7 @@ def timestamp_to_seconds(timestamp):
|
|
66 |
return None
|
67 |
|
68 |
def generate_labels_from_diarization(diarization_output):
|
69 |
-
successful_lines = 0
|
70 |
labels_path = 'labels.txt'
|
71 |
try:
|
72 |
with open(labels_path, 'w') as outfile:
|
@@ -76,9 +100,11 @@ def generate_labels_from_diarization(diarization_output):
|
|
76 |
parts = line.strip()[1:-1].split(' --> ')
|
77 |
start_time = parts[0].strip()
|
78 |
end_time = parts[1].split(']')[0].strip()
|
79 |
-
label = line.split()[-1].strip() #
|
80 |
start_seconds = timestamp_to_seconds(start_time)
|
81 |
end_seconds = timestamp_to_seconds(end_time)
|
|
|
|
|
82 |
outfile.write(f"{start_seconds}\t{end_seconds}\t{label}\n")
|
83 |
successful_lines += 1
|
84 |
except Exception as e:
|
@@ -89,38 +115,29 @@ def generate_labels_from_diarization(diarization_output):
|
|
89 |
print(f"Cannot write to file '{labels_path}'. Error: {e}")
|
90 |
return None
|
91 |
|
92 |
-
|
93 |
-
|
94 |
def process_audio(audio, num_speakers, min_speakers, max_speakers):
|
95 |
-
|
|
|
|
|
|
|
|
|
96 |
if diarization_result.startswith("Error"):
|
97 |
-
return diarization_result, None
|
98 |
-
else:
|
99 |
-
label_file = generate_labels_from_diarization(diarization_result)
|
100 |
-
return diarization_result, label_file
|
101 |
|
|
|
|
|
|
|
|
|
102 |
with gr.Blocks() as demo:
|
103 |
gr.Markdown("""
|
104 |
# ๐ฃ๏ธPyannote Speaker Diarization 3.1๐ฃ๏ธ
|
105 |
-
|
106 |
This model takes an audio file as input and outputs the diarization of the speakers in the audio.
|
107 |
-
|
108 |
Please upload an audio file and adjust the parameters as needed.
|
109 |
|
110 |
-
The maximum length of the audio file that can be processed depends based on the hardware it's running on. If you are on the ZeroGPU HuggingFace Space, it's around **35-40 minutes**.
|
111 |
-
|
112 |
If you find this space helpful, please โค it.
|
113 |
-
|
114 |
-
Join my server for support and open source AI discussion: https://discord.gg/osai
|
115 |
-
|
116 |
-
IF YOU LEAVE ALL THE PARAMETERS BELOW TO 0, IT WILL BE ON AUTO MODE, AUTOMATICALLY DETECTING THE SPEAKERS, ELSE USE THE ONES BELOW FOR MORE COSTUMIZATION & BETTER RESULTS
|
117 |
-
|
118 |
""")
|
119 |
audio_input = gr.Audio(type="filepath", label="Upload Audio File")
|
120 |
-
num_speakers_input = gr.Number(label="Number of Speakers",
|
121 |
-
|
122 |
-
gr.Markdown("Use the following parameters only if you don't know the number of speakers, you can set lower and/or upper bounds on the number of speakers, if instead you know it, leave the following parameters to 0 and use the one above")
|
123 |
-
|
124 |
min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
|
125 |
max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
|
126 |
process_button = gr.Button("Process")
|
@@ -131,5 +148,6 @@ with gr.Blocks() as demo:
|
|
131 |
fn=process_audio,
|
132 |
inputs=[audio_input, num_speakers_input, min_speakers_input, max_speakers_input],
|
133 |
outputs=[diarization_output, label_file_link]
|
134 |
-
)
|
135 |
-
|
|
|
|
6 |
|
7 |
# Instantiate the diarization pipeline once, at import time.
# On any failure `pipeline` is set to None so the request handlers can report
# "Pipeline not initialized" instead of the app crashing on startup.
try:
    # Check that the token exists in the environment and is not empty.
    auth_token = os.environ.get("HUGGINGFACE_READ_TOKEN")
    if not auth_token:
        raise ValueError("HUGGINGFACE_READ_TOKEN not found or is empty")

    print("Token found, attempting to initialize pipeline...")

    # Attempt to initialize the Pipeline.
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=auth_token,
        cache_dir="./cache",  # try to reuse previously downloaded files
    )

    # NOTE(review): pyannote.audio 3.x appears to return None (after printing
    # a hint) rather than raising when the gated model cannot be downloaded.
    # Without this check the failure would surface below as an opaque
    # "'NoneType' object has no attribute 'to'".
    if pipeline is None:
        raise RuntimeError(
            "Pipeline.from_pretrained returned None; check that the token is "
            "valid and that the model's user conditions have been accepted"
        )

    # Keep the pipeline on CPU for now.
    device = torch.device("cpu")
    pipeline.to(device)
    print("Pipeline initialized successfully!")

except Exception as e:
    # Top-level boundary: report the problem and degrade gracefully.
    print(f"Error initializing pipeline: {e}")
    pipeline = None
|
31 |
|
|
|
32 |
def save_audio(audio):
    """Copy the uploaded audio file to a private temporary file.

    Args:
        audio: Filesystem path of the uploaded audio file.

    Returns:
        The path of the temporary copy on success, the string
        "Error: Pipeline not initialized" when the module-level ``pipeline``
        is None, or None when the file could not be copied.
    """
    # Local imports: the file's top-level import block is not guaranteed to
    # provide these two modules.
    import shutil
    import tempfile

    if pipeline is None:
        return "Error: Pipeline not initialized"

    temp_file = None
    try:
        # Keep the real extension so the copy is not mislabelled as WAV;
        # fall back to ".wav" when the upload has no extension.
        suffix = os.path.splitext(audio)[1] or ".wav"

        with open(audio, "rb") as src:
            # A unique name per call: a fixed "temp.wav" would be overwritten
            # (and later deleted) by any concurrent request.
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as dst:
                temp_file = dst.name
                # Stream in chunks instead of loading the whole file in memory.
                shutil.copyfileobj(src, dst)
    except Exception as e:
        print(f"Error saving audio file: {e}")
        # Do not leave a partially written copy behind.
        if temp_file is not None:
            try:
                os.remove(temp_file)
            except OSError as cleanup_error:
                print(f"Error removing temporary file {temp_file}: {cleanup_error}")
        return None

    print(f"Audio file saved to {temp_file}")
    return temp_file
|
51 |
|
52 |
@spaces.GPU(duration=60 * 2)
|
53 |
def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
|
|
|
55 |
return "Error: Pipeline not initialized"
|
56 |
|
57 |
try:
|
58 |
+
# Prepare the parameters according to the user's input
|
59 |
params = {}
|
60 |
if num_speakers > 0:
|
61 |
params["num_speakers"] = num_speakers
|
|
|
64 |
if max_speakers > 0:
|
65 |
params["max_speakers"] = max_speakers
|
66 |
|
67 |
+
print(f"Processing audio file {temp_file} with parameters: {params}")
|
68 |
diarization = pipeline(temp_file, **params)
|
69 |
+
print("Diarization completed successfully!")
|
70 |
except Exception as e:
|
71 |
+
print(f"Error processing audio: {e}")
|
72 |
return f"Error processing audio: {e}"
|
73 |
|
74 |
+
# Remove the temporary file after processing
|
75 |
+
try:
|
76 |
+
os.remove(temp_file)
|
77 |
+
print(f"Temporary file {temp_file} removed successfully.")
|
78 |
+
except Exception as e:
|
79 |
+
print(f"Error removing temporary file {temp_file}: {e}")
|
80 |
|
|
|
81 |
return str(diarization)
|
82 |
+
|
83 |
def timestamp_to_seconds(timestamp):
|
84 |
try:
|
85 |
+
# Convert a timestamp to seconds
|
86 |
h, m, s = map(float, timestamp.split(':'))
|
87 |
return 3600 * h + 60 * m + s
|
88 |
except ValueError as e:
|
|
|
90 |
return None
|
91 |
|
92 |
def generate_labels_from_diarization(diarization_output):
|
93 |
+
successful_lines = 0
|
94 |
labels_path = 'labels.txt'
|
95 |
try:
|
96 |
with open(labels_path, 'w') as outfile:
|
|
|
100 |
parts = line.strip()[1:-1].split(' --> ')
|
101 |
start_time = parts[0].strip()
|
102 |
end_time = parts[1].split(']')[0].strip()
|
103 |
+
label = line.split()[-1].strip() # ืืงืืืช ืืชืืืืช ืืืฉืืจื
|
104 |
start_seconds = timestamp_to_seconds(start_time)
|
105 |
end_seconds = timestamp_to_seconds(end_time)
|
106 |
+
if start_seconds is None or end_seconds is None:
|
107 |
+
continue
|
108 |
outfile.write(f"{start_seconds}\t{end_seconds}\t{label}\n")
|
109 |
successful_lines += 1
|
110 |
except Exception as e:
|
|
|
115 |
print(f"Cannot write to file '{labels_path}'. Error: {e}")
|
116 |
return None
|
117 |
|
|
|
|
|
118 |
def process_audio(audio, num_speakers, min_speakers, max_speakers):
    """Run the full upload -> diarization -> label-file flow for one request.

    Args:
        audio: Path of the uploaded audio file, passed on to ``save_audio``.
        num_speakers: Speaker count forwarded to ``diarize_audio``.
        min_speakers: Lower speaker bound forwarded to ``diarize_audio``.
        max_speakers: Upper speaker bound forwarded to ``diarize_audio``.

    Returns:
        A ``(text, label_file)`` pair. ``label_file`` is None when saving the
        audio failed or when the diarization text starts with "Error";
        otherwise it is whatever ``generate_labels_from_diarization`` returns.
    """
    saved_path = save_audio(audio)
    if saved_path is None:
        return "Error saving audio file", None

    result_text = diarize_audio(saved_path, num_speakers, min_speakers, max_speakers)

    # Only build the label file for a successful diarization.
    if result_text.startswith("Error"):
        labels = None
    else:
        labels = generate_labels_from_diarization(result_text)
    return result_text, labels
|
129 |
+
|
130 |
+
# Gradio interface
|
131 |
with gr.Blocks() as demo:
|
132 |
gr.Markdown("""
|
133 |
# ๐ฃ๏ธPyannote Speaker Diarization 3.1๐ฃ๏ธ
|
|
|
134 |
This model takes an audio file as input and outputs the diarization of the speakers in the audio.
|
|
|
135 |
Please upload an audio file and adjust the parameters as needed.
|
136 |
|
|
|
|
|
137 |
If you find this space helpful, please โค it.
|
|
|
|
|
|
|
|
|
|
|
138 |
""")
|
139 |
audio_input = gr.Audio(type="filepath", label="Upload Audio File")
|
140 |
+
num_speakers_input = gr.Number(label="Number of Speakers", value=0)
|
|
|
|
|
|
|
141 |
min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
|
142 |
max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
|
143 |
process_button = gr.Button("Process")
|
|
|
148 |
fn=process_audio,
|
149 |
inputs=[audio_input, num_speakers_input, min_speakers_input, max_speakers_input],
|
150 |
outputs=[diarization_output, label_file_link]
|
151 |
+
)
|
152 |
+
|
153 |
+
demo.launch(share=False)
|