Nitzantry1 committed on
Commit ff841ad · verified · 1 Parent(s): 1f97d8b

Update app.py

Files changed (1)
  1. app.py +58 -40
app.py CHANGED
@@ -6,31 +6,48 @@ from pyannote.audio import Pipeline

# instantiate the pipeline
try:
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
-        use_auth_token=os.environ["HUGGINGFACE_READ_TOKEN"]
    )
-    # Move the pipeline to the GPU
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pipeline.to(device)
except Exception as e:
    print(f"Error initializing pipeline: {e}")
    pipeline = None

-
def save_audio(audio):
    if pipeline is None:
        return "Error: Pipeline not initialized"

-    # Read the uploaded audio file as bytes
-    with open(audio, "rb") as f:
-        audio_data = f.read()

-    # Save the uploaded audio file to a temporary location
-    with open("temp.wav", "wb") as f:
-        f.write(audio_data)

-    return "temp.wav"

@spaces.GPU(duration=60 * 2)
def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
@@ -38,6 +55,7 @@ def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
        return "Error: Pipeline not initialized"

    try:
        params = {}
        if num_speakers > 0:
            params["num_speakers"] = num_speakers
@@ -46,19 +64,25 @@ def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
        if max_speakers > 0:
            params["max_speakers"] = max_speakers

        diarization = pipeline(temp_file, **params)
    except Exception as e:
        return f"Error processing audio: {e}"

-    # Remove the temporary file
-    os.remove(temp_file)

-    # Return the diarization output
    return str(diarization)
-
def timestamp_to_seconds(timestamp):
    try:
-        # Extracts hour, minute, and second from timestamp and converts to total seconds
        h, m, s = map(float, timestamp.split(':'))
        return 3600 * h + 60 * m + s
    except ValueError as e:
@@ -66,7 +90,7 @@ def timestamp_to_seconds(timestamp):
        return None

def generate_labels_from_diarization(diarization_output):
-    successful_lines = 0  # Counter for successfully processed lines
    labels_path = 'labels.txt'
    try:
        with open(labels_path, 'w') as outfile:
@@ -76,9 +100,11 @@ def generate_labels_from_diarization(diarization_output):
                parts = line.strip()[1:-1].split(' --> ')
                start_time = parts[0].strip()
                end_time = parts[1].split(']')[0].strip()
-                label = line.split()[-1].strip()  # Extracting the last word as label
                start_seconds = timestamp_to_seconds(start_time)
                end_seconds = timestamp_to_seconds(end_time)
                outfile.write(f"{start_seconds}\t{end_seconds}\t{label}\n")
                successful_lines += 1
    except Exception as e:
@@ -89,38 +115,29 @@ def generate_labels_from_diarization(diarization_output):
        print(f"Cannot write to file '{labels_path}'. Error: {e}")
        return None

-
-
def process_audio(audio, num_speakers, min_speakers, max_speakers):
-    diarization_result = diarize_audio(save_audio(audio), num_speakers, min_speakers, max_speakers)
    if diarization_result.startswith("Error"):
-        return diarization_result, None  # Return None for label file link if there's an error
-    else:
-        label_file = generate_labels_from_diarization(diarization_result)
-        return diarization_result, label_file

with gr.Blocks() as demo:
    gr.Markdown("""
    # 🗣️Pyannote Speaker Diarization 3.1🗣️
-
    This model takes an audio file as input and outputs the diarization of the speakers in the audio.
-
    Please upload an audio file and adjust the parameters as needed.

-    The maximum length of the audio file that can be processed depends on the hardware it's running on. If you are on the ZeroGPU HuggingFace Space, it's around **35-40 minutes**.
-
    If you find this space helpful, please ❤ it.
-
-    Join my server for support and open source AI discussion: https://discord.gg/osai
-
-    IF YOU LEAVE ALL THE PARAMETERS BELOW AT 0, IT WILL RUN IN AUTO MODE, AUTOMATICALLY DETECTING THE SPEAKERS; OTHERWISE USE THEM FOR MORE CUSTOMIZATION & BETTER RESULTS
-
    """)
    audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-    num_speakers_input = gr.Number(label="Number of Speakers", info="Use it only if you know the number of speakers in advance, else leave it at 0 and use the parameters below", value=0)
-
-    gr.Markdown("Use the following parameters only if you don't know the number of speakers; you can set lower and/or upper bounds on the number of speakers. If instead you know it, leave these at 0 and use the one above.")
-
    min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
    max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
    process_button = gr.Button("Process")
@@ -131,5 +148,6 @@ with gr.Blocks() as demo:
        fn=process_audio,
        inputs=[audio_input, num_speakers_input, min_speakers_input, max_speakers_input],
        outputs=[diarization_output, label_file_link]
-    )
-    demo.launch(share = False)
 
 
The updated app.py as shown on the new side of the diff (added lines are marked with +):

@@ -6,31 +6,48 @@ from pyannote.audio import Pipeline

# instantiate the pipeline
try:
+    # Check that the token exists in the environment and is not empty
+    auth_token = os.environ.get("HUGGINGFACE_READ_TOKEN")
+    if not auth_token:
+        raise ValueError("HUGGINGFACE_READ_TOKEN not found or is empty")
+
+    print("Token found, attempting to initialize pipeline...")
+
+    # Attempt to initialize the pipeline
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
+        use_auth_token=auth_token,
+        cache_dir="./cache"  # try to reuse the local model cache
    )
+
+    # Keep the pipeline on CPU only, given the Space runs on the free tier
+    device = torch.device("cpu")
    pipeline.to(device)
+    print("Pipeline initialized successfully!")
+
except Exception as e:
    print(f"Error initializing pipeline: {e}")
    pipeline = None

def save_audio(audio):
    if pipeline is None:
        return "Error: Pipeline not initialized"

+    try:
+        # Read the uploaded audio file
+        with open(audio, "rb") as f:
+            audio_data = f.read()

+        # Save the audio file to a temporary location
+        temp_file = "temp.wav"
+        with open(temp_file, "wb") as f:
+            f.write(audio_data)

+        print(f"Audio file saved to {temp_file}")
+        return temp_file
+    except Exception as e:
+        print(f"Error saving audio file: {e}")
+        return None

@spaces.GPU(duration=60 * 2)
def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
 
@@ -38,6 +55,7 @@ def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
        return "Error: Pipeline not initialized"

    try:
+        # Prepare parameters according to the user's input
        params = {}
        if num_speakers > 0:
            params["num_speakers"] = num_speakers
 
@@ -46,19 +64,25 @@ def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
        if max_speakers > 0:
            params["max_speakers"] = max_speakers

+        print(f"Processing audio file {temp_file} with parameters: {params}")
        diarization = pipeline(temp_file, **params)
+        print("Diarization completed successfully!")
    except Exception as e:
+        print(f"Error processing audio: {e}")
        return f"Error processing audio: {e}"

+    # Remove the temporary file after processing
+    try:
+        os.remove(temp_file)
+        print(f"Temporary file {temp_file} removed successfully.")
+    except Exception as e:
+        print(f"Error removing temporary file {temp_file}: {e}")

    return str(diarization)
+
def timestamp_to_seconds(timestamp):
    try:
+        # Convert the timestamp to seconds
        h, m, s = map(float, timestamp.split(':'))
        return 3600 * h + 60 * m + s
    except ValueError as e:
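For reference, timestamp_to_seconds turns a pyannote-style timestamp such as 00:01:23.500 into 0*3600 + 1*60 + 23.5 = 83.5 seconds; a string that does not split into three numeric fields hits the ValueError branch and the function returns None.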
 
@@ -66,7 +90,7 @@ def timestamp_to_seconds(timestamp):
        return None

def generate_labels_from_diarization(diarization_output):
+    successful_lines = 0
    labels_path = 'labels.txt'
    try:
        with open(labels_path, 'w') as outfile:
 
@@ -76,9 +100,11 @@ def generate_labels_from_diarization(diarization_output):
                parts = line.strip()[1:-1].split(' --> ')
                start_time = parts[0].strip()
                end_time = parts[1].split(']')[0].strip()
+                label = line.split()[-1].strip()  # take the speaker label from the line
                start_seconds = timestamp_to_seconds(start_time)
                end_seconds = timestamp_to_seconds(end_time)
+                if start_seconds is None or end_seconds is None:
+                    continue
                outfile.write(f"{start_seconds}\t{end_seconds}\t{label}\n")
                successful_lines += 1
    except Exception as e:
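Each line that parses successfully becomes one tab-separated row in labels.txt (start seconds, end seconds, speaker label), a format Audacity can import as a label track. As an illustrative sketch with made-up numbers (not taken from the commit), a turn by SPEAKER_00 between 0.497 s and 3.723 s would be written as:

0.497	3.723	SPEAKER_00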
 
@@ -89,38 +115,29 @@ def generate_labels_from_diarization(diarization_output):
        print(f"Cannot write to file '{labels_path}'. Error: {e}")
        return None

def process_audio(audio, num_speakers, min_speakers, max_speakers):
+    temp_file = save_audio(audio)
+    if temp_file is None:
+        return "Error saving audio file", None
+
+    diarization_result = diarize_audio(temp_file, num_speakers, min_speakers, max_speakers)
    if diarization_result.startswith("Error"):
+        return diarization_result, None

+    label_file = generate_labels_from_diarization(diarization_result)
+    return diarization_result, label_file
+
+# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("""
    # 🗣️Pyannote Speaker Diarization 3.1🗣️
    This model takes an audio file as input and outputs the diarization of the speakers in the audio.
    Please upload an audio file and adjust the parameters as needed.

    If you find this space helpful, please ❤ it.
    """)
    audio_input = gr.Audio(type="filepath", label="Upload Audio File")
+    num_speakers_input = gr.Number(label="Number of Speakers", value=0)
    min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
    max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
    process_button = gr.Button("Process")
 
@@ -131,5 +148,6 @@ with gr.Blocks() as demo:
        fn=process_audio,
        inputs=[audio_input, num_speakers_input, min_speakers_input, max_speakers_input],
        outputs=[diarization_output, label_file_link]
+    )
+
+    demo.launch(share=False)
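For completeness, a minimal sketch of running the same diarization outside the Space, assuming pyannote.audio 3.x, a valid HUGGINGFACE_READ_TOKEN in the environment, and a local audio file; the file name "sample.wav" and the speaker bounds below are illustrative, not part of the commit:

import os
import torch
from pyannote.audio import Pipeline

# Load the gated model with a read token taken from the environment, as app.py does.
token = os.environ.get("HUGGINGFACE_READ_TOKEN")
if not token:
    raise ValueError("HUGGINGFACE_READ_TOKEN not found or is empty")

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token,
)
pipeline.to(torch.device("cpu"))  # CPU-only, matching the updated app.py

# min/max speaker bounds are optional, as in diarize_audio(); omit them for auto mode.
diarization = pipeline("sample.wav", min_speakers=2, max_speakers=4)

# Iterate speaker turns directly instead of parsing str(diarization).
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:.3f}\t{turn.end:.3f}\t{speaker}")

Iterating itertracks(yield_label=True) yields each segment with its speaker label, which sidesteps the text parsing that generate_labels_from_diarization performs on str(diarization).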