clement-pages committed
Commit f0061fc · 1 Parent(s): 074b1c4

wip: allow use of any pyannote pipeline

pyannote_viewer/backend/pyannote_viewer/pyannote_viewer.py CHANGED
```diff
@@ -19,7 +19,9 @@ from gradio.events import Events
 from gradio.exceptions import Error
 
 from pyannote.core.annotation import Annotation
+from pyannote.core.feature import SlidingWindowFeature
 
+import torchaudio
 
 @dataclasses.dataclass
 class WaveformOptions:
@@ -249,7 +251,7 @@ class PyannoteViewer(
         )
 
     def postprocess(
-        self, value: Tuple[Annotation, np.ndarray] | None
+        self, value: Tuple[Annotation, np.ndarray | Path | str] | None
     ) -> FileData | bytes | None:
         """
         Parameters:
@@ -260,30 +262,40 @@ class PyannoteViewer(
         if value is None:
             return None
 
-        annotations, sources = value
+        annotations, audio = value
+
         labels = annotations.labels()
 
         # format diarization output
         segments = []
         for segment, _, label in annotations.itertracks(yield_label=True):
-            label_idx = labels.index(label)
+            label_idx = labels.index(label) if isinstance(audio, SlidingWindowFeature) else 0
             segments.append(
                 Segment(start=segment.start, end=segment.end, channel=label_idx)
             )
 
-        # save sources in cache
-        source_filepath = processing_utils.save_audio_to_cache(
-            data=sources.data,
-            sample_rate=16_000,
-            format=self.format,
-            cache_dir=self.GRADIO_CACHE,
-        )
-        orig_name = Path(source_filepath).name
+        if isinstance(audio, SlidingWindowFeature):
+            # save sources in cache
+            audio_filepath = processing_utils.save_audio_to_cache(
+                data=audio.data,
+                sample_rate=16_000,
+                format=self.format,
+                cache_dir=self.GRADIO_CACHE,
+            )
+            multichannel = True
+        elif isinstance(audio, (Path, str)):
+            audio_filepath = audio
+            multichannel = False
+        else:
+            raise ValueError("Unknown type for audio value")
+
+        orig_name = Path(audio_filepath).name
 
         return {
             "segments": segments,
             "labels": labels,
-            "sources_file": FileData(path=source_filepath, orig_name=orig_name),
+            "multichannel": multichannel,
+            "sources_file": FileData(path=audio_filepath, orig_name=orig_name),
         }
 
     def stream_output(
```
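
With this change, `postprocess` accepts two shapes of `value`: a separation result, where `audio` is a `SlidingWindowFeature` holding one separated source per channel, and a plain diarization result paired with the original audio path, where every segment is pinned to channel 0 and speaker identity is carried only by `labels`. A rough sketch of the payload each case produces (timestamps and file names are illustrative, not from a real run):

```python
# Diarization pipeline: audio arrives as a Path/str, so the viewer receives
# the original single-channel file and all segments share channel 0.
diarization_payload = {
    "segments": [
        {"start": 0.5, "end": 2.1, "channel": 0},  # SPEAKER_00
        {"start": 2.3, "end": 4.0, "channel": 0},  # SPEAKER_01, same channel
    ],
    "labels": ["SPEAKER_00", "SPEAKER_01"],
    "multichannel": False,
    "sources_file": "audio.wav",  # FileData pointing at the input file
}

# Separation pipeline: audio arrives as a SlidingWindowFeature, so the sources
# are written to the Gradio cache as one multichannel file and each label maps
# to its own channel via labels.index(label).
separation_payload = {
    "segments": [
        {"start": 0.5, "end": 2.1, "channel": 0},  # source for SPEAKER_00
        {"start": 2.3, "end": 4.0, "channel": 1},  # source for SPEAKER_01
    ],
    "labels": ["SPEAKER_00", "SPEAKER_01"],
    "multichannel": True,
    "sources_file": "sources.wav",  # FileData pointing at the cached file
}
```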
pyannote_viewer/demo/app.py CHANGED
```diff
@@ -5,10 +5,20 @@ import os
 
 
 def apply_pipeline(audio: str) -> tuple:
+    # pipeline = Pipeline.from_pretrained(
+    #     "pyannote/speech-separation-ami-1.0", use_auth_token=os.environ["HF_TOKEN"]
+    # )
+
     pipeline = Pipeline.from_pretrained(
-        "pyannote/speech-separation-ami-1.0", use_auth_token=os.environ["HF_TOKEN"]
+        "pyannote/speaker-diarization-3.1", use_auth_token=os.environ["HF_TOKEN"]
     )
-    return pipeline(audio)
+
+
+    outputs = pipeline(audio)
+    if isinstance(outputs, tuple):
+        return outputs
+    else:
+        return (outputs, audio)
 
 
 with gr.Blocks() as demo:
```
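
The `isinstance(outputs, tuple)` check works because a separation pipeline such as `pyannote/speech-separation-ami-1.0` returns an `(Annotation, SlidingWindowFeature)` pair, while a diarization-only pipeline such as `pyannote/speaker-diarization-3.1` returns just the `Annotation`; pairing the latter with the input path gives `postprocess` the `(annotations, audio)` tuple it now expects in both cases. A minimal sketch of the same normalization for an arbitrary pipeline name (the helper itself is illustrative, not part of the commit):

```python
import os

from pyannote.audio import Pipeline


def run_pipeline(pipeline_name: str, audio: str) -> tuple:
    # Hypothetical helper: load any pretrained pyannote pipeline and
    # normalize its output to the (annotations, audio) tuple the viewer expects.
    pipeline = Pipeline.from_pretrained(
        pipeline_name, use_auth_token=os.environ["HF_TOKEN"]
    )
    outputs = pipeline(audio)
    if isinstance(outputs, tuple):
        # separation pipelines: (Annotation, SlidingWindowFeature)
        return outputs
    # diarization pipelines: Annotation only, so keep the original audio path
    return (outputs, audio)
```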
pyannote_viewer/frontend/Index.svelte CHANGED
```diff
@@ -16,7 +16,7 @@
 	export let elem_classes: string[] = [];
 	export let visible = true;
 	export let interactive: boolean;
-	export let value: null | {"segments": Segment[], "labels" : string[], "sources_file": FileData} = null;
+	export let value: null | {"segments": Segment[], "labels" : string[], "multichannel": boolean, "sources_file": FileData} = null;
 	export let sources:
 		| ["microphone"]
 		| ["upload"]
```
pyannote_viewer/frontend/player/AudioPlayer.svelte CHANGED
```diff
@@ -72,6 +72,7 @@
 	$: waveform?.on("decode", (duration: any) => {
 		audioDecoded = true;
 		const numChannels = waveform.getDecodedData().numberOfChannels;
+		console.log(numChannels);
 		audio_duration = duration;
 		durationRef && (durationRef.textContent = format_time(duration));
 
```