Ngoufack committed
Commit e0dc9fc · 1 Parent(s): e285f0a
Files changed (1)
  1. app.py +12 -8
app.py CHANGED
@@ -11,7 +11,7 @@ from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
 from pyannote.audio import Model
 from pyannote.core import Segment
 from transformers.pipelines.audio_utils import ffmpeg_read
-
+from pyannote.audio import Pipeline
 
 MODEL_NAME = "medium"
 BATCH_SIZE = 8
@@ -21,8 +21,10 @@ YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = WhisperModel(MODEL_NAME, device=device, compute_type="float16" if torch.cuda.is_available() else "int8")
 
-model_pyannote = Model.from_pretrained("pyannote/speaker-diarization")
-pipeline = SpeakerDiarization(model_pyannote)
+#model_pyannote = Model.from_pretrained("pyannote/speaker-diarization")
+pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
+
+#pipeline = SpeakerDiarization(model_pyannote)
 
 @spaces.GPU
 def transcribe(inputs, task):
@@ -31,11 +33,13 @@ def transcribe(inputs, task):
 
     segments, _ = model.transcribe(inputs, task=task)
     text = " ".join([segment.text for segment in segments])
-    waveform, sample_rate = torchaudio.load(audio_path)
-    if sample_rate != 16000:
-        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-        waveform = transform(waveform)
-
+    diarization = pipeline(inputs)
+    speaker_segments = []
+    for segment, _, speaker in diarization.itertracks(yield_label=True):
+        speaker_segments.append((segment.start, segment.end, speaker))
+
+    # Associate transcription segments with speakers
+    speaker_texts = []
     #diarization = pipeline({"uri": "audio", "audio": audio_path})
     #speaker_segments = []
     return text
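
Note: the commit collects `speaker_segments` but leaves `speaker_texts` empty and still returns only `text`. A minimal sketch of how the two outputs could be joined, assuming the faster-whisper segments are materialized into a list first (the `transcribe` call returns a generator, which the join above already consumes) and expose `start`, `end`, and `text` attributes; the helper name `assign_speakers` is hypothetical and not part of the commit:

# Hypothetical helper, not in the commit: give each transcription segment
# the speaker whose diarization turn overlaps it the most.
def assign_speakers(whisper_segments, speaker_segments):
    # whisper_segments: list of faster-whisper Segment objects (.start, .end, .text)
    # speaker_segments: list of (start, end, speaker) tuples built in transcribe()
    speaker_texts = []
    for seg in whisper_segments:
        best_speaker, best_overlap = None, 0.0
        for turn_start, turn_end, speaker in speaker_segments:
            # Temporal overlap between the transcription segment and the speaker turn.
            overlap = min(seg.end, turn_end) - max(seg.start, turn_start)
            if overlap > best_overlap:
                best_overlap, best_speaker = overlap, speaker
        speaker_texts.append((best_speaker, seg.text))
    return speaker_texts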