File size: 5,758 Bytes
93d373e
090acab
f30c373
7c9216a
18e78ec
ba685bf
aad12fa
fc03567
ff841ad
 
 
 
 
 
 
 
fc03567
 
ff841ad
 
fc03567
ff841ad
 
 
93d373e
ff841ad
 
fc03567
 
 
1c57ed2
7180a69
fc03567
 
 
ff841ad
 
 
 
9946e7a
ff841ad
 
 
 
7180a69
ff841ad
 
 
 
 
7180a69
633b502
7180a69
 
 
0d076a1
fc03567
ff841ad
ce1f6bf
 
 
 
 
 
 
 
ff841ad
7180a69
ff841ad
fc03567
ff841ad
fc03567
0d076a1
ff841ad
 
 
 
 
 
0d076a1
aad12fa
ff841ad
28a2b9a
 
ff841ad
28a2b9a
 
 
 
 
 
 
ff841ad
28a2b9a
 
 
 
 
 
 
 
 
ff841ad
28a2b9a
 
ff841ad
 
28a2b9a
 
 
 
 
 
 
 
 
 
 
ff841ad
 
 
 
 
28a2b9a
ff841ad
1429210
ff841ad
 
 
 
db1ee1f
a599ac3
 
 
 
 
 
 
633b502
ff841ad
28a2b9a
 
2129f6b
0d076a1
28a2b9a
1429210
7180a69
28a2b9a
7180a69
28a2b9a
ff841ad
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import torch
import spaces
import gradio as gr
import os
from pyannote.audio import Pipeline

# Instantiate the diarization pipeline once, at import time.
# Any failure is reported on stdout and leaves `pipeline` set to None, which
# the request handlers in this file check before doing any work.
try:
    # Check that the token exists in the environment and is not empty.
    auth_token = os.environ.get("HUGGINGFACE_READ_TOKEN")
    if not auth_token:
        raise ValueError("HUGGINGFACE_READ_TOKEN not found or is empty")
    
    print("Token found, attempting to initialize pipeline...")

    # Attempt to initialize the Pipeline.
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=auth_token,
        cache_dir="./cache"  # try to reuse a local cache
    )

    # Move the Pipeline to CPU only, on the assumption that the app runs on
    # the free tier.
    device = torch.device("cpu")
    pipeline.to(device)
    print("Pipeline initialized successfully!")

except Exception as e:
    # Top-level boundary: log the reason and mark the pipeline as unavailable.
    print(f"Error initializing pipeline: {e}")
    pipeline = None

def save_audio(audio):
    """Copy the uploaded audio file into a private temporary file.

    Each call gets its own uniquely named file, so overlapping requests can
    no longer overwrite one another's audio. The upload's extension is kept
    (falling back to ``.wav`` when it has none) and the data is streamed
    instead of being read fully into memory.

    Args:
        audio: Path of the uploaded audio file.

    Returns:
        The path of the temporary copy (the caller is responsible for
        deleting it), ``None`` if the copy failed, or the string
        ``"Error: Pipeline not initialized"`` when the pipeline is
        unavailable.
    """
    # Function-scope imports: these modules are not imported at file level.
    import shutil
    import tempfile

    if pipeline is None:
        return "Error: Pipeline not initialized"

    try:
        suffix = os.path.splitext(audio)[1] or ".wav"
        fd, temp_file = tempfile.mkstemp(prefix="diarize_", suffix=suffix)
        try:
            with os.fdopen(fd, "wb") as dst, open(audio, "rb") as src:
                shutil.copyfileobj(src, dst)
        except Exception:
            # Do not leave an empty or partially written file behind.
            os.remove(temp_file)
            raise
    except Exception as e:
        print(f"Error saving audio file: {e}")
        return None

    print(f"Audio file saved to {temp_file}")
    return temp_file

@spaces.GPU(duration=60 * 2)
def diarize_audio(temp_file, num_speakers, min_speakers, max_speakers):
    """Run speaker diarization on ``temp_file`` and return the result as text.

    Args:
        temp_file: Path of the audio file to process. Once processing has
            been attempted the file is deleted, whether or not it succeeded.
        num_speakers: Exact number of speakers. ``None`` or a value that
            truncates to 0 or less means "not specified".
        min_speakers: Lower bound on the number of speakers (same rule).
        max_speakers: Upper bound on the number of speakers (same rule).

    Returns:
        ``str(diarization)`` on success, or a message starting with
        ``"Error"`` when the pipeline is unavailable or processing fails.
    """
    if pipeline is None:
        return "Error: Pipeline not initialized"

    try:
        # Forward only the constraints the user actually set. Speaker counts
        # are whole numbers, but the UI fields may deliver floats (or None
        # when cleared), so coerce before handing them to the pipeline.
        params = {}
        for name, value in (
            ("num_speakers", num_speakers),
            ("min_speakers", min_speakers),
            ("max_speakers", max_speakers),
        ):
            if value is None:
                continue
            count = int(value)
            if count > 0:
                params[name] = count

        print(f"Processing audio file {temp_file} with parameters: {params}")
        diarization = pipeline(temp_file, **params)
        print("Diarization completed successfully!")
    except Exception as e:
        print(f"Error processing audio: {e}")
        return f"Error processing audio: {e}"
    finally:
        # Remove the temporary file on both the success and the error path so
        # failed runs do not leave audio behind.
        try:
            os.remove(temp_file)
            print(f"Temporary file {temp_file} removed successfully.")
        except Exception as e:
            print(f"Error removing temporary file {temp_file}: {e}")

    return str(diarization)

def timestamp_to_seconds(timestamp):
    """Convert an ``H:M:S`` timestamp string to a number of seconds.

    Each of the three colon-separated fields is parsed as a float, so
    fractional seconds are accepted.

    Returns:
        The total as a float, or ``None`` (after printing a diagnostic) when
        the string does not hold exactly three numeric fields.
    """
    try:
        hours, minutes, seconds = (float(field) for field in timestamp.split(':'))
    except ValueError as e:
        print(f"Error converting timestamp to seconds: '{timestamp}'. Error: {e}")
        return None
    return 3600 * hours + 60 * minutes + seconds

def generate_labels_from_diarization(diarization_output):
    """Write the diarization text to ``labels.txt`` as tab-separated labels.

    The parsing assumes each input line has the shape
    ``[ start --> end] ... label``. For every line that parses, one row
    ``start_seconds<TAB>end_seconds<TAB>label`` is written; lines that fail
    are reported on stdout and skipped.

    Returns:
        The path of the label file when at least one row was written,
        otherwise ``None`` (also when the file could not be written).
    """
    labels_path = 'labels.txt'
    written = 0
    try:
        with open(labels_path, 'w') as outfile:
            for line in diarization_output.strip().split('\n'):
                try:
                    # Drop the first and last character, then split the time
                    # span on the arrow.
                    span = line.strip()[1:-1].split(' --> ')
                    start_time = span[0].strip()
                    end_time = span[1].split(']')[0].strip()
                    # The label is the last whitespace-separated token.
                    label = line.split()[-1].strip()
                    start_seconds = timestamp_to_seconds(start_time)
                    end_seconds = timestamp_to_seconds(end_time)
                    if start_seconds is not None and end_seconds is not None:
                        outfile.write(f"{start_seconds}\t{end_seconds}\t{label}\n")
                        written += 1
                except Exception as e:
                    print(f"Error processing line: '{line.strip()}'. Error: {e}")
        print(f"Processed {written} lines successfully.")
    except Exception as e:
        print(f"Cannot write to file '{labels_path}'. Error: {e}")
        return None

    if written > 0:
        return labels_path
    return None

def process_audio(audio, num_speakers, min_speakers, max_speakers):
    """Handle one request: save the upload, diarize it, build the label file.

    Returns:
        A ``(text, label_file)`` pair. ``text`` is the diarization output or
        an error message; ``label_file`` is the label file path, or ``None``
        when a step failed or no label row could be written.
    """
    saved_path = save_audio(audio)
    if saved_path is None:
        return "Error saving audio file", None

    text = diarize_audio(saved_path, num_speakers, min_speakers, max_speakers)
    # Error results are plain strings that start with "Error"; no label file
    # is produced for them.
    labels = None if text.startswith("Error") else generate_labels_from_diarization(text)
    return text, labels

# Gradio interface: one audio upload, three optional speaker-count hints, and
# a button that runs process_audio and shows its two results.
with gr.Blocks() as demo:
    gr.Markdown("""
    # 🗣️Pyannote Speaker Diarization 3.1🗣️
    This model takes an audio file as input and outputs the diarization of the speakers in the audio.
    Please upload an audio file and adjust the parameters as needed.
    
    If you find this space helpful, please ❤ it.
    """)
    # type="filepath": the handler receives a path, which save_audio opens.
    audio_input = gr.Audio(type="filepath", label="Upload Audio File")
    # A value of 0 means "not specified": diarize_audio only forwards values > 0.
    num_speakers_input = gr.Number(label="Number of Speakers", value=0)
    min_speakers_input = gr.Number(label="Minimum Number of Speakers", value=0)
    max_speakers_input = gr.Number(label="Maximum Number of Speakers", value=0)
    process_button = gr.Button("Process")
    diarization_output = gr.Textbox(label="Diarization Output")
    label_file_link = gr.File(label="Download DAW Labels")

    # process_audio returns (text, label_file), matching the two outputs.
    process_button.click(
        fn=process_audio,
        inputs=[audio_input, num_speakers_input, min_speakers_input, max_speakers_input],
        outputs=[diarization_output, label_file_link]
    )

# Start the app without creating a public share link.
demo.launch(share=False)