File size: 1,723 Bytes
e58bff7
d9e71ec
 
 
e58bff7
d9e71ec
e58bff7
c82bffb
d9e71ec
e58bff7
d9e71ec
c82bffb
 
d9e71ec
c82bffb
 
 
 
e58bff7
 
 
c82bffb
4d2cffe
 
 
 
d9e71ec
 
4d2cffe
d9e71ec
 
c82bffb
 
 
d9e71ec
4d2cffe
 
 
c82bffb
d9e71ec
c82bffb
 
e58bff7
d9e71ec
e58bff7
d9e71ec
4d2cffe
d9e71ec
e58bff7
d9e71ec
e58bff7
 
d9e71ec
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import tempfile
import gradio as gr
from pyannote.audio import Pipeline

# Read the Hugging Face token from the HF_TOKEN secret
# Stop at import time when the token is unset or empty; the message tells the
# operator where to configure it.
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None or hf_token == "":
    raise ValueError("HF_TOKEN is missing. Please set it in the Secrets section.")

# Load the pyannote speaker-diarization model
# Build the diarization pipeline once at import time so every request reuses
# it. Any failure is re-raised as RuntimeError with the original exception
# attached as its cause, so the full traceback stays available.
try:
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=hf_token,
    )
except Exception as e:
    raise RuntimeError(f"Failed to load the pipeline: {e}") from e

# NOTE(review): some pyannote.audio releases appear to print a hint and return
# None (rather than raise) when the gated model cannot be downloaded -- confirm
# against the installed version. Failing here gives a clear startup error
# instead of "'NoneType' object is not callable" on the first request.
if pipeline is None:
    raise RuntimeError(
        "Failed to load the pipeline: from_pretrained() returned None. "
        "Check that HF_TOKEN is valid and has access to the model."
    )

# Speaker diarization function
def diarize(audio):
    """Run speaker diarization on an uploaded audio file and format the turns.

    Args:
        audio: The uploaded audio. Either a binary file-like object exposing
            ``read()`` (its content is copied to a temporary ``.wav`` file
            first), or a filesystem path (``str`` / ``os.PathLike``), which
            is handed to the pipeline as-is. ``None`` yields an error string.

    Returns:
        One line per speaker turn, formatted as
        ``"<speaker>: <start>s - <end>s"`` with one decimal place, joined by
        newlines. On any failure a string starting with
        ``"Error during diarization: "`` is returned instead of raising.
    """
    if audio is None:
        # Nothing to process (presumably a submit without an upload).
        return "Error during diarization: no audio file was provided."

    temp_audio_path = None
    try:
        if isinstance(audio, (str, os.PathLike)):
            audio_path = audio
        else:
            # The pipeline is called with a path, so spool the upload to disk.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
                # Record the path before writing so a failed read()/write()
                # still gets cleaned up below.
                temp_audio_path = temp_audio.name
                temp_audio.write(audio.read())
            audio_path = temp_audio_path

        diarization = pipeline(audio_path)

        return "\n".join(
            f"{speaker}: {turn.start:.1f}s - {turn.end:.1f}s"
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        )
    except Exception as e:
        return f"Error during diarization: {e}"
    finally:
        # Always delete the spooled copy, including when the pipeline raised;
        # the file was created with delete=False and would otherwise leak.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not replace the
                # result (or error message) being returned.
                pass

# Build the Gradio interface
# Upload-only audio input, passed to `diarize` as its single argument.
# NOTE(review): `gr.inputs` looks like Gradio's legacy component namespace --
# confirm the pinned Gradio version still ships it.
_audio_input = gr.inputs.Audio(source="upload", type="file")

# Single-function UI: one audio upload in, the diarization text out.
interface = gr.Interface(
    fn=diarize,
    inputs=_audio_input,
    outputs="text",
    title="Speaker Diarization",
    description="Upload an audio file (WAV, MP3, etc.) to detect speakers and their timestamps.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()