pyannote_diarization-3.1

Running

App Files Files Community

Nitzantry1 committed on Nov 22, 2024

Commit

ff841ad

verified ·

1 Parent(s): 1f97d8b

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -40

app.py CHANGED Viewed

@@ -6,31 +6,48 @@ from pyannote.audio import Pipeline
 # instantiate the pipeline
 try:
     pipeline = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1",
-        use_auth_token=os.environ["HUGGINGFACE_READ_TOKEN"]
     )
-    # Move the pipeline to the GPU
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     pipeline.to(device)
 except Exception as e:
     print(f"Error initializing pipeline: {e}")
     pipeline = None
 def save_audio(audio):
     if pipeline is None:
         return "Error: Pipeline not initialized"
-    # Read the uploaded audio file as bytes
-    with open(audio, "rb") as f:
-        audio_data = f.read()
-    # Save the uploaded audio file to a temporary location
-    with open("temp.wav", "wb") as f:
-        f.write(audio_data)
-    return "temp.wav"
 @spaces.GPU(duration=60 * 2)
 def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
@@ -38,6 +55,7 @@ def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
         return "Error: Pipeline not initialized"
     try:
         params = {}
         if num_speakers > 0:
             params["num_speakers"] = num_speakers
@@ -46,19 +64,25 @@ def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
         if max_speakers > 0:
             params["max_speakers"] = max_speakers
         diarization = pipeline(temp_file, **params)
     except Exception as e:
         return f"Error processing audio: {e}"
-    # Remove the temporary file
-    os.remove(temp_file)
-    # Return the diarization output
     return str(diarization)
 def timestamp_to_seconds(timestamp):
     try:
-        # Extracts hour, minute, and second from timestamp and converts to total seconds
         h, m, s = map(float, timestamp.split(':'))
         return 3600 * h + 60 * m + s
     except ValueError as e:
@@ -66,7 +90,7 @@ def timestamp_to_seconds(timestamp):
         return None
 def generate_labels_from_diarization(diarization_output):
-    successful_lines = 0  # Counter for successfully processed lines
     labels_path = 'labels.txt'
     try:
         with open(labels_path, 'w') as outfile:
@@ -76,9 +100,11 @@ def generate_labels_from_diarization(diarization_output):
                     parts = line.strip()[1:-1].split(' --> ')
                     start_time = parts[0].strip()
                     end_time = parts[1].split(']')[0].strip()
-                    label = line.split()[-1].strip()  # Extracting the last word as label
                     start_seconds = timestamp_to_seconds(start_time)
                     end_seconds = timestamp_to_seconds(end_time)
                     outfile.write(f"{start_seconds}\t{end_seconds}\t{label}\n")
                     successful_lines += 1
                 except Exception as e:
@@ -89,38 +115,29 @@ def generate_labels_from_diarization(diarization_output):
         print(f"Cannot write to file '{labels_path}'. Error: {e}")
         return None
 def process_audio(audio, num_speakers, min_speakers, max_speakers):
-    diarization_result = diarize_audio(save_audio(audio), num_speakers, min_speakers, max_speakers)
     if diarization_result.startswith("Error"):
-        return diarization_result, None  # Return None for label file link if there's an error
-    else:
-        label_file = generate_labels_from_diarization(diarization_result)
-        return diarization_result, label_file
 with gr.Blocks() as demo:
     gr.Markdown("""
     # 🗣️Pyannote Speaker Diarization 3.1🗣️
     This model takes an audio file as input and outputs the diarization of the speakers in the audio.
     Please upload an audio file and adjust the parameters as needed.
-    The maximum length of the audio file that can be processed depends based on the hardware it's running on. If you are on the ZeroGPU HuggingFace Space, it's around **35-40 minutes**.
     If you find this space helpful, please ❤ it.
-    Join my server for support and open source AI discussion: https://discord.gg/osai
-    IF YOU LEAVE ALL THE PARAMETERS BELOW TO 0, IT WILL BE ON AUTO MODE, AUTOMATICALLY DETECTING THE SPEAKERS, ELSE USE THE ONES BELOW FOR MORE COSTUMIZATION & BETTER RESULTS
     """)
     audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-    num_speakers_input = gr.Number(label="Number of Speakers", info="Use it only if you know the number of speakers in advance, else leave it to 0 and use the parameters below", value=0)
-    gr.Markdown("Use the following parameters only if you don't know the number of speakers, you can set lower and/or upper bounds on the number of speakers, if instead you know it, leave the following parameters to 0 and use the one above")
     min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
     max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
     process_button = gr.Button("Process")
@@ -131,5 +148,6 @@ with gr.Blocks() as demo:
         fn=process_audio,
         inputs=[audio_input, num_speakers_input, min_speakers_input, max_speakers_input],
         outputs=[diarization_output, label_file_link]
-)
-demo.launch(share = False)

 # instantiate the pipeline
 try:
+    # Check that the token exists in the environment and is not empty
+    auth_token = os.environ.get("HUGGINGFACE_READ_TOKEN")
+    if not auth_token:
+        raise ValueError("HUGGINGFACE_READ_TOKEN not found or is empty")
+    print("Token found, attempting to initialize pipeline...")
+    # Attempt to initialize the pipeline
     pipeline = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1",
+        use_auth_token=auth_token,
+        cache_dir="./cache"  # to try to make use of the cache
     )
+    # Move the pipeline to CPU only, given that you are on the free tier
+    device = torch.device("cpu")
     pipeline.to(device)
+    print("Pipeline initialized successfully!")
 except Exception as e:
     print(f"Error initializing pipeline: {e}")
     pipeline = None
 def save_audio(audio):
     if pipeline is None:
         return "Error: Pipeline not initialized"
+    try:
+        # Read the uploaded audio file
+        with open(audio, "rb") as f:
+            audio_data = f.read()
+        # Save the audio file to a temporary location
+        temp_file = "temp.wav"  # NOTE(review): fixed name; presumably shared by concurrent requests - confirm
+        with open(temp_file, "wb") as f:
+            f.write(audio_data)
+        print(f"Audio file saved to {temp_file}")
+        return temp_file
+    except Exception as e:
+        print(f"Error saving audio file: {e}")
+        return None
 @spaces.GPU(duration=60 * 2)
 def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
         return "Error: Pipeline not initialized"
     try:
+        # Prepare the parameters from the user's input
         params = {}
         if num_speakers > 0:
             params["num_speakers"] = num_speakers
         if max_speakers > 0:
             params["max_speakers"] = max_speakers
+        print(f"Processing audio file {temp_file} with parameters: {params}")
         diarization = pipeline(temp_file, **params)
+        print("Diarization completed successfully!")
     except Exception as e:
+        print(f"Error processing audio: {e}")
         return f"Error processing audio: {e}"
+    # Remove the temporary file after processing
+    try:
+        os.remove(temp_file)
+        print(f"Temporary file {temp_file} removed successfully.")
+    except Exception as e:
+        print(f"Error removing temporary file {temp_file}: {e}")
     return str(diarization)
 def timestamp_to_seconds(timestamp):
     try:
+        # Convert the timestamp to seconds
         h, m, s = map(float, timestamp.split(':'))
         return 3600 * h + 60 * m + s
     except ValueError as e:
         return None
 def generate_labels_from_diarization(diarization_output):
+    successful_lines = 0
     labels_path = 'labels.txt'
     try:
         with open(labels_path, 'w') as outfile:
                     parts = line.strip()[1:-1].split(' --> ')
                     start_time = parts[0].strip()
                     end_time = parts[1].split(']')[0].strip()
+                    label = line.split()[-1].strip()  # take the label from the line
                     start_seconds = timestamp_to_seconds(start_time)
                     end_seconds = timestamp_to_seconds(end_time)
+                    if start_seconds is None or end_seconds is None:
+                        continue
                     outfile.write(f"{start_seconds}\t{end_seconds}\t{label}\n")
                     successful_lines += 1
                 except Exception as e:
         print(f"Cannot write to file '{labels_path}'. Error: {e}")
         return None
 def process_audio(audio, num_speakers, min_speakers, max_speakers):
+    temp_file = save_audio(audio)
+    if temp_file is None:  # save_audio returns None when copying the upload fails
+        return "Error saving audio file", None
+    diarization_result = diarize_audio(temp_file, num_speakers, min_speakers, max_speakers)
     if diarization_result.startswith("Error"):
+        return diarization_result, None  # no label file when diarization reported an error
+    label_file = generate_labels_from_diarization(diarization_result)
+    return diarization_result, label_file
+# Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("""
     # 🗣️Pyannote Speaker Diarization 3.1🗣️
     This model takes an audio file as input and outputs the diarization of the speakers in the audio.
     Please upload an audio file and adjust the parameters as needed.
     If you find this space helpful, please ❤ it.
     """)
     audio_input = gr.Audio(type="filepath", label="Upload Audio File")
+    num_speakers_input = gr.Number(label="Number of Speakers", value=0)
     min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
     max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
     process_button = gr.Button("Process")
         fn=process_audio,
         inputs=[audio_input, num_speakers_input, min_speakers_input, max_speakers_input],
         outputs=[diarization_output, label_file_link]
+    )
+demo.launch(share=False)