clement-pages committed
Commit f0061fc · 1 Parent(s): 074b1c4

wip: allow use of any pyannote pipeline

pyannote_viewer/backend/pyannote_viewer/pyannote_viewer.py CHANGED
```diff
@@ -19,7 +19,9 @@ from gradio.events import Events
 from gradio.exceptions import Error
 
 from pyannote.core.annotation import Annotation
+from pyannote.core.feature import SlidingWindowFeature
 
+import torchaudio
 
 @dataclasses.dataclass
 class WaveformOptions:
@@ -249,7 +251,7 @@ class PyannoteViewer(
         )
 
     def postprocess(
-        self, value: Tuple[Annotation, np.ndarray] | None
+        self, value: Tuple[Annotation, np.ndarray | Path | str] | None
     ) -> FileData | bytes | None:
         """
         Parameters:
@@ -260,30 +262,40 @@ class PyannoteViewer(
         if value is None:
             return None
 
-        annotations, sources = value
+        annotations, audio = value
+
         labels = annotations.labels()
 
         # format diarization output
         segments = []
         for segment, _, label in annotations.itertracks(yield_label=True):
-            label_idx = labels.index(label)
+            label_idx = labels.index(label) if isinstance(audio, SlidingWindowFeature) else 0
             segments.append(
                 Segment(start=segment.start, end=segment.end, channel=label_idx)
             )
 
-        # save sources in cache
-        source_filepath = processing_utils.save_audio_to_cache(
-            data=sources.data,
-            sample_rate=16_000,
-            format=self.format,
-            cache_dir=self.GRADIO_CACHE,
-        )
-        orig_name = Path(source_filepath).name
+        if isinstance(audio, SlidingWindowFeature):
+            # save sources in cache
+            audio_filepath = processing_utils.save_audio_to_cache(
+                data=audio.data,
+                sample_rate=16_000,
+                format=self.format,
+                cache_dir=self.GRADIO_CACHE,
+            )
+            multichannel = True
+        elif isinstance(audio, (Path, str)):
+            audio_filepath = audio
+            multichannel = False
+        else:
+            raise ValueError("Unknown type for audio value")
+
+        orig_name = Path(audio_filepath).name
 
         return {
             "segments": segments,
             "labels": labels,
-            "sources_file": FileData(path=source_filepath, orig_name=orig_name),
+            "multichannel": multichannel,
+            "sources_file": FileData(path=audio_filepath, orig_name=orig_name),
         }
 
     def stream_output(
```
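
With this change, `postprocess` accepts two shapes of `value`: a separation result, where `audio` is a `SlidingWindowFeature` holding one separated source per channel, and a plain diarization result paired with the original audio path, where every segment is pinned to channel 0 and speaker identity is carried only by `labels`. A rough sketch of the payload each case produces (timestamps and file names are illustrative, not from a real run):

```python
# Diarization pipeline: audio arrives as a Path/str, so the viewer receives
# the original single-channel file and all segments share channel 0.
diarization_payload = {
    "segments": [
        {"start": 0.5, "end": 2.1, "channel": 0},  # SPEAKER_00
        {"start": 2.3, "end": 4.0, "channel": 0},  # SPEAKER_01, same channel
    ],
    "labels": ["SPEAKER_00", "SPEAKER_01"],
    "multichannel": False,
    "sources_file": "audio.wav",  # FileData pointing at the input file
}

# Separation pipeline: audio arrives as a SlidingWindowFeature, so the sources
# are written to the Gradio cache as one multichannel file and each label maps
# to its own channel via labels.index(label).
separation_payload = {
    "segments": [
        {"start": 0.5, "end": 2.1, "channel": 0},  # source for SPEAKER_00
        {"start": 2.3, "end": 4.0, "channel": 1},  # source for SPEAKER_01
    ],
    "labels": ["SPEAKER_00", "SPEAKER_01"],
    "multichannel": True,
    "sources_file": "sources.wav",  # FileData pointing at the cached file
}
```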
pyannote_viewer/demo/app.py CHANGED
```diff
@@ -5,10 +5,20 @@ import os
 
 
 def apply_pipeline(audio: str) -> tuple:
+    # pipeline = Pipeline.from_pretrained(
+    #     "pyannote/speech-separation-ami-1.0", use_auth_token=os.environ["HF_TOKEN"]
+    # )
+
     pipeline = Pipeline.from_pretrained(
-        "pyannote/speech-separation-ami-1.0", use_auth_token=os.environ["HF_TOKEN"]
+        "pyannote/speaker-diarization-3.1", use_auth_token=os.environ["HF_TOKEN"]
     )
-    return pipeline(audio)
+
+
+    outputs = pipeline(audio)
+    if isinstance(outputs, tuple):
+        return outputs
+    else:
+        return (outputs, audio)
 
 
 with gr.Blocks() as demo:
```
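
The `isinstance(outputs, tuple)` check works because a separation pipeline such as `pyannote/speech-separation-ami-1.0` returns an `(Annotation, SlidingWindowFeature)` pair, while a diarization-only pipeline such as `pyannote/speaker-diarization-3.1` returns just the `Annotation`; pairing the latter with the input path gives `postprocess` the `(annotations, audio)` tuple it now expects in both cases. A minimal sketch of the same normalization for an arbitrary pipeline name (the helper itself is illustrative, not part of the commit):

```python
import os

from pyannote.audio import Pipeline


def run_pipeline(pipeline_name: str, audio: str) -> tuple:
    # Hypothetical helper: load any pretrained pyannote pipeline and
    # normalize its output to the (annotations, audio) tuple the viewer expects.
    pipeline = Pipeline.from_pretrained(
        pipeline_name, use_auth_token=os.environ["HF_TOKEN"]
    )
    outputs = pipeline(audio)
    if isinstance(outputs, tuple):
        # separation pipelines: (Annotation, SlidingWindowFeature)
        return outputs
    # diarization pipelines: Annotation only, so keep the original audio path
    return (outputs, audio)
```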
pyannote_viewer/frontend/Index.svelte CHANGED
```diff
@@ -16,7 +16,7 @@
 	export let elem_classes: string[] = [];
 	export let visible = true;
 	export let interactive: boolean;
-	export let value: null | {"segments": Segment[], "labels" : string[], "sources_file": FileData} = null;
+	export let value: null | {"segments": Segment[], "labels" : string[], "multichannel": boolean, "sources_file": FileData} = null;
 	export let sources:
 		| ["microphone"]
 		| ["upload"]
```
pyannote_viewer/frontend/player/AudioPlayer.svelte CHANGED
```diff
@@ -72,6 +72,7 @@
 	$: waveform?.on("decode", (duration: any) => {
 		audioDecoded = true;
 		const numChannels = waveform.getDecodedData().numberOfChannels;
+		console.log(numChannels);
 		audio_duration = duration;
 		durationRef && (durationRef.textContent = format_time(duration));
 
```