KIFF committed on
Commit
c2d200c
·
verified ·
1 Parent(s): d4fd773

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +2 -47
handler.py CHANGED
@@ -1,47 +1,2 @@
1
- from typing import Dict
2
- from pyannote.audio import Pipeline
3
- import torch
4
- import base64
5
- import numpy as np
6
-
7
SAMPLE_RATE = 16000  # sample rate (Hz) this handler assumes for the incoming PCM audio


class EndpointHandler():
    def __init__(self, path=""):
        """Load the pretrained speaker-diarization pipeline once at startup."""
        self.pipeline = Pipeline.from_pretrained("KIFF/pyannote-speaker-diarization-endpoint")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, list]:
        """Run speaker diarization on base64-encoded 16 kHz, 16-bit PCM audio.

        Args:
            data: dict with key "inputs" holding the base64-encoded raw
                int16 PCM audio bytes, and an optional "parameters" dict
                forwarded as keyword arguments to the pipeline
                (e.g. min_speakers=2, max_speakers=5).

        Returns:
            {"diarization": [{"label": ..., "start": ..., "stop": ...}, ...]}
            — one entry per diarized segment, all values rendered as strings.
        """
        # process input; falls back to the whole payload if "inputs" is absent
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)  # e.g. min_speakers=2, max_speakers=5

        # decode the base64 audio data into a 16-bit PCM sample array
        audio_data = base64.b64decode(inputs)
        audio_nparray = np.frombuffer(audio_data, dtype=np.int16)

        # prepare pyannote input: normalize int16 PCM to [-1.0, 1.0] floats.
        # FIX: the original fed raw int16-scaled floats (±32768) to the
        # pipeline; pyannote expects a normalized float waveform tensor of
        # shape (channels, samples).
        audio_tensor = torch.from_numpy(audio_nparray).float().unsqueeze(0) / 32768.0
        pyannote_input = {"waveform": audio_tensor, "sample_rate": SAMPLE_RATE}

        # apply the pretrained pipeline, forwarding any caller-supplied kwargs
        if parameters is not None:
            diarization = self.pipeline(pyannote_input, **parameters)
        else:
            diarization = self.pipeline(pyannote_input)

        # postprocess: flatten each (segment, track, label) into a JSON-safe dict
        processed_diarization = [
            {"label": str(label), "start": str(segment.start), "stop": str(segment.end)}
            for segment, _, label in diarization.itertracks(yield_label=True)
        ]

        return {"diarization": processed_diarization}
 
1
+ torch==1.11.0
2
+ git+https://github.com/philschmid/pyannote-audio.[email protected]