Woziii committed on
Commit
02ae966
·
verified ·
1 Parent(s): 3ad44f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -25
app.py CHANGED
@@ -47,42 +47,63 @@ pipe = pipeline(
47
  )
48
 
49
 
50
-
51
-
52
def associate_speakers_with_timestamps(transcription_result, diarization):
    """Attribute each transcribed word to a diarization speaker.

    Consecutive words attributed to the same speaker are joined into one
    entry. A word whose start time also falls inside a neighbouring
    segment belonging to a different speaker is wrapped in square brackets
    to flag overlapping speech.

    Args:
        transcription_result: Mapping with a ``'chunks'`` list; each chunk
            is a mapping with ``'text'`` and a ``'timestamp'`` pair whose
            first item is the word start time.
        diarization: Object exposing ``itertracks(yield_label=True)`` that
            yields ``(segment, track, speaker)`` triples, where ``segment``
            has ``start`` and ``end`` attributes. Segments are assumed to
            be ordered by start time -- TODO confirm against the
            diarization library in use.

    Returns:
        A list of ``(speaker, text)`` tuples in transcript order. The list
        is empty when the diarization contains no segments.
    """
    word_segments = transcription_result['chunks']
    diarization_segments = list(diarization.itertracks(yield_label=True))

    # Without any diarization segment there is no speaker to attribute to;
    # indexing the empty list below would otherwise raise IndexError.
    if not diarization_segments:
        return []

    last_index = len(diarization_segments) - 1
    current_segment_index = 0
    turns = []  # (speaker, [word, ...]) pairs, joined once at the end

    for word in word_segments:
        word_start = word['timestamp'][0]

        # A missing start time cannot be compared with segment bounds, so
        # such a word simply stays with the segment currently in effect.
        if word_start is not None:
            # Advance to the first segment that has not ended yet; words
            # after the final segment stay attributed to that segment.
            while (current_segment_index < last_index
                   and diarization_segments[current_segment_index][0].end <= word_start):
                current_segment_index += 1

        _, _, current_speaker = diarization_segments[current_segment_index]

        overlapping = False
        if word_start is not None:
            if current_segment_index > 0:
                previous_segment, _, previous_speaker = diarization_segments[current_segment_index - 1]
                overlapping = (previous_segment.end > word_start
                               and previous_speaker != current_speaker)
            # The cursor only moves on once a segment has ended, so the
            # usual overlap is with the *next* segment having already begun.
            if not overlapping and current_segment_index < last_index:
                next_segment, _, next_speaker = diarization_segments[current_segment_index + 1]
                overlapping = (next_segment.start <= word_start < next_segment.end
                               and next_speaker != current_speaker)

        word_text = f"[{word['text']}]" if overlapping else word['text']

        # Extend the current speaker turn, or open a new one on a change.
        if turns and turns[-1][0] == current_speaker:
            turns[-1][1].append(word_text)
        else:
            turns.append((current_speaker, [word_text]))

    return [(speaker, " ".join(words)) for speaker, words in turns]
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def merge_short_segments(speaker_transcription, min_words=3):
87
  merged = []
88
  for speaker, text in speaker_transcription:
 
47
  )
48
 
49
 
 
 
50
def _overlap_intervals(diarization_segments):
    """Return merged time spans where consecutive diarization segments overlap.

    Args:
        diarization_segments: List of ``(segment, track, speaker)`` triples
            whose ``segment`` has ``start`` and ``end`` attributes.

    Returns:
        A ``(starts, ends)`` pair of parallel lists describing disjoint
        half-open intervals ``[start, end)``, sorted by start.
    """
    raw = []
    for (previous, _, _), (current, _, _) in zip(diarization_segments, diarization_segments[1:]):
        overlap_end = min(previous.end, current.end)
        if overlap_end > current.start:
            raw.append((current.start, overlap_end))

    # Merge touching/nested spans so a single bisect lookup is sufficient.
    raw.sort()
    starts, ends = [], []
    for start, end in raw:
        if ends and start <= ends[-1]:
            ends[-1] = max(ends[-1], end)
        else:
            starts.append(start)
            ends.append(end)
    return starts, ends


def associate_speakers_with_timestamps(transcription_result, diarization):
    """Group transcribed words by the diarization segment they start in.

    One ``(speaker, text)`` entry is produced per diarization segment that
    receives at least one word. Words whose start time lies inside a span
    where two consecutive segments overlap are wrapped in square brackets.

    Args:
        transcription_result: Mapping with a ``'chunks'`` list; each chunk
            is a mapping with ``'text'`` and a ``'timestamp'`` pair whose
            first item is the word start time.
        diarization: Object exposing ``itertracks(yield_label=True)`` that
            yields ``(segment, track, speaker)`` triples, where ``segment``
            has ``start`` and ``end`` attributes. Segments are assumed to
            be ordered by start time -- TODO confirm against the
            diarization library in use.

    Returns:
        A list of ``(speaker, text)`` tuples in transcript order. The list
        is empty when the diarization contains no segments.
    """
    from bisect import bisect_right

    word_segments = transcription_result['chunks']
    diarization_segments = list(diarization.itertracks(yield_label=True))

    # No segment means no speaker to attribute words to; an unguarded
    # next() on the empty iterator would raise StopIteration here.
    if not diarization_segments:
        return []

    overlap_starts, overlap_ends = _overlap_intervals(diarization_segments)

    speaker_transcription = []
    current_text = []
    segment_index = 0
    last_index = len(diarization_segments) - 1
    current_segment, _, current_speaker = diarization_segments[0]

    for word in word_segments:
        word_start = word['timestamp'][0]
        in_overlap = False

        # A missing start time cannot be placed on the timeline, so the
        # word stays, unmarked, with the segment currently in effect.
        if word_start is not None:
            # Move to the first segment that has not ended yet, flushing
            # the finished segment's words. The cursor never moves past
            # the last segment, so trailing words are kept, not dropped.
            while segment_index < last_index and word_start >= current_segment.end:
                if current_text:
                    speaker_transcription.append((current_speaker, ' '.join(current_text)))
                    current_text = []
                segment_index += 1
                current_segment, _, current_speaker = diarization_segments[segment_index]

            # Interval containment rather than exact float equality with a
            # segment start, which practically never matches a word time.
            position = bisect_right(overlap_starts, word_start) - 1
            in_overlap = position >= 0 and word_start < overlap_ends[position]

        current_text.append(f"[{word['text']}]" if in_overlap else word['text'])

    if current_text:
        speaker_transcription.append((current_speaker, ' '.join(current_text)))

    return speaker_transcription
90
 
91
def merge_short_segments(speaker_transcription, min_words=3):
    """Merge consecutive same-speaker entries until each has enough words.

    Entries are grouped into runs of consecutive items sharing a speaker.
    Within a run, texts are accumulated word by word and emitted as one
    entry as soon as at least ``min_words`` words have been collected. Any
    shorter remainder at the end of a run is emitted as its own entry.

    NOTE(review): in the file as shown, another ``merge_short_segments``
    definition follows this one and would shadow it at import time.

    Args:
        speaker_transcription: Iterable of ``(speaker, text)`` tuples.
        min_words: Minimum number of whitespace-separated words per merged
            entry before it is emitted.

    Returns:
        A list of ``(speaker, text)`` tuples.
    """
    # Imported locally so the block does not depend on a module-level
    # import that is not visible in this part of the file.
    from itertools import groupby

    def merge_group(group):
        speaker, entries = group
        merged = []
        current = []
        # groupby yields the original (speaker, text) tuples, so the text
        # has to be unpacked before it can be split into words.
        for _, text in entries:
            current.extend(text.split())
            if len(current) >= min_words:
                merged.append(' '.join(current))
                current = []
        if current:
            merged.append(' '.join(current))
        return [(speaker, text) for text in merged]

    return [
        item
        for group in groupby(speaker_transcription, key=lambda entry: entry[0])
        for item in merge_group(group)
    ]
106
+
107
  def merge_short_segments(speaker_transcription, min_words=3):
108
  merged = []
109
  for speaker, text in speaker_transcription: