Woziii committed on
Commit b2a20a3 · verified · 1 Parent(s): 2b05a52

Update app.py

Files changed (1): app.py +63 -14

app.py CHANGED
@@ -49,7 +49,7 @@ pipe = pipeline(
 
 
 
-def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05):
+def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05, max_words_to_merge=20):
     word_segments = transcription_result['chunks']
     diarization_segments = list(diarization.itertracks(yield_label=True))
     speaker_transcription = []
@@ -60,14 +60,18 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
     def flush_current_segment():
         nonlocal current_speaker, current_text
         if current_speaker and current_text:
-            speaker_transcription.append((current_speaker, ' '.join(current_text)))
+            speaker_transcription.append({
+                "speaker": current_speaker,
+                "text": ' '.join(current_text),
+                "start": word_segments[len(speaker_transcription)]['timestamp'][0],
+                "end": word_segments[len(speaker_transcription) + len(current_text) - 1]['timestamp'][1]
+            })
             current_text = []
 
     for word in word_segments:
         word_start, word_end = word['timestamp']
         word_text = word['text']
 
-        # Find the diarization segment that matches this word
         matching_segment = None
         for segment, _, speaker in diarization_segments:
             if segment.start - tolerance <= word_start < segment.end + tolerance:
@@ -80,32 +84,77 @@ def associate_speakers_with_timestamps(transcription_result, diarization, tolera
                 flush_current_segment()
                 current_speaker = speaker
 
-            # Handle long pauses
             if word_start - last_word_end > 1.0:  # pause longer than one second
                 flush_current_segment()
 
             current_text.append(word_text)
             last_word_end = word_end
         else:
-            # If no segment matches, attribute the word to the last known speaker
             if current_speaker:
                 current_text.append(word_text)
             else:
-                # First word without a match: open a new segment
                 current_speaker = "SPEAKER_UNKNOWN"
                 current_text.append(word_text)
 
     flush_current_segment()
 
-    # Merge short segments from the same speaker
-    merged_transcription = []
-    for speaker, text in speaker_transcription:
-        if not merged_transcription or merged_transcription[-1][0] != speaker or len(text.split()) > 3:
-            merged_transcription.append((speaker, text))
-        else:
-            merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)
+    def detect_interruptions(transcription, time_threshold=0.5):
+        for i in range(len(transcription) - 1):
+            current_end = transcription[i]['end']
+            next_start = transcription[i+1]['start']
+            if next_start - current_end < time_threshold:
+                transcription[i]['text'] += ' [...]'
+                transcription[i+1]['text'] = '[...] ' + transcription[i+1]['text']
+        return transcription
+
+    speaker_transcription = detect_interruptions(speaker_transcription)
+
+    def post_process_transcription(transcription, max_words):
+        processed = []
+        current_speaker = None
+        current_text = []
+        current_start = None
+        current_end = None
+
+        for segment in transcription:
+            if segment['speaker'] == current_speaker and len(' '.join(current_text + [segment['text']]).split()) <= max_words:
+                current_text.append(segment['text'])
+                current_end = segment['end']
+            else:
+                if current_speaker:
+                    processed.append({
+                        "speaker": current_speaker,
+                        "text": ' '.join(current_text),
+                        "start": current_start,
+                        "end": current_end
+                    })
+                current_speaker = segment['speaker']
+                current_text = [segment['text']]
+                current_start = segment['start']
+                current_end = segment['end']
+
+        if current_speaker:
+            processed.append({
+                "speaker": current_speaker,
+                "text": ' '.join(current_text),
+                "start": current_start,
+                "end": current_end
+            })
+
+        return processed
+
+    merged_transcription = post_process_transcription(speaker_transcription, max_words_to_merge)
+
+    speakers = sorted(set(segment['speaker'] for segment in merged_transcription))
+    metadata = {
+        "speaker_count": len(speakers),
+        "speakers": speakers
+    }
 
-    return merged_transcription
+    return {
+        "transcription": merged_transcription,
+        "metadata": metadata
+    }
 
 def simplify_diarization_output(speaker_transcription):
     simplified = []
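For reference, here is a minimal, self-contained sketch of the word-to-speaker matching rule used in the function above. The Segment namedtuple and the sample timings are hypothetical stand-ins for the (segment, track, label) triples that pyannote.audio's diarization.itertracks(yield_label=True) yields:

from collections import namedtuple

# Hypothetical stand-in for a pyannote.audio Segment (times in seconds).
Segment = namedtuple("Segment", ["start", "end"])

# (segment, track, speaker) triples, shaped like itertracks(yield_label=True) output.
diarization_segments = [
    (Segment(0.0, 2.5), "A", "SPEAKER_00"),
    (Segment(2.5, 5.0), "B", "SPEAKER_01"),
]

def find_speaker(word_start, segments, tolerance=0.02):
    # Same rule as in the diff: a word belongs to the first segment whose
    # [start - tolerance, end + tolerance) window contains the word's start time.
    for segment, _, speaker in segments:
        if segment.start - tolerance <= word_start < segment.end + tolerance:
            return speaker
    return None

print(find_speaker(1.0, diarization_segments))  # SPEAKER_00
print(find_speaker(2.6, diarization_segments))  # SPEAKER_01
print(find_speaker(9.0, diarization_segments))  # None, i.e. the SPEAKER_UNKNOWN path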
 
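The new detect_interruptions pass can be sanity-checked in isolation; in this sketch the function body mirrors the diff and the sample segments are made up. Consecutive segments separated by less than time_threshold seconds are tagged with '[...]' on both sides of the boundary:

def detect_interruptions(transcription, time_threshold=0.5):
    # Mirrors the helper in the diff: tag near-contiguous segment pairs.
    for i in range(len(transcription) - 1):
        if transcription[i + 1]['start'] - transcription[i]['end'] < time_threshold:
            transcription[i]['text'] += ' [...]'
            transcription[i + 1]['text'] = '[...] ' + transcription[i + 1]['text']
    return transcription

segments = [
    {"speaker": "SPEAKER_00", "text": "Hello everyone", "start": 0.0, "end": 1.8},
    {"speaker": "SPEAKER_01", "text": "Hi there", "start": 2.0, "end": 3.0},     # 0.2 s gap: tagged
    {"speaker": "SPEAKER_00", "text": "Let's begin", "start": 4.5, "end": 5.5},  # 1.5 s gap: untouched
]

for seg in detect_interruptions(segments):
    print(seg["speaker"], "->", seg["text"])
# SPEAKER_00 -> Hello everyone [...]
# SPEAKER_01 -> [...] Hi there
# SPEAKER_00 -> Let's begin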
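The merging rule in post_process_transcription can likewise be illustrated with hand-made segments. The condensed version below is intended to be behaviorally equivalent to the helper in the diff: consecutive segments keep merging while the speaker is unchanged and the combined text stays within max_words:

def merge_segments(transcription, max_words):
    # Condensed restatement of post_process_transcription from the diff.
    processed = []
    for segment in transcription:
        if (processed
                and processed[-1]["speaker"] == segment["speaker"]
                and len((processed[-1]["text"] + " " + segment["text"]).split()) <= max_words):
            processed[-1]["text"] += " " + segment["text"]
            processed[-1]["end"] = segment["end"]
        else:
            processed.append(dict(segment))
    return processed

segments = [
    {"speaker": "SPEAKER_00", "text": "one two three", "start": 0.0, "end": 1.0},
    {"speaker": "SPEAKER_00", "text": "four five", "start": 1.2, "end": 2.0},
    {"speaker": "SPEAKER_01", "text": "six", "start": 2.5, "end": 3.0},
]

for seg in merge_segments(segments, max_words=20):
    print(seg)
# {'speaker': 'SPEAKER_00', 'text': 'one two three four five', 'start': 0.0, 'end': 2.0}
# {'speaker': 'SPEAKER_01', 'text': 'six', 'start': 2.5, 'end': 3.0}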
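With the new return shape, callers receive a dict instead of a list of (speaker, text) tuples. Here is a hedged sketch of what a consumer might do with it; the result value below is a hypothetical example of the function's output, with field names taken from the diff:

# Hypothetical output of the updated associate_speakers_with_timestamps.
result = {
    "transcription": [
        {"speaker": "SPEAKER_00", "text": "Hello everyone", "start": 0.0, "end": 1.8},
        {"speaker": "SPEAKER_01", "text": "Hi there", "start": 2.0, "end": 3.0},
    ],
    "metadata": {"speaker_count": 2, "speakers": ["SPEAKER_00", "SPEAKER_01"]},
}

print(f"{result['metadata']['speaker_count']} speaker(s): {', '.join(result['metadata']['speakers'])}")
for seg in result["transcription"]:
    print(f"[{seg['start']:5.2f}-{seg['end']:5.2f}] {seg['speaker']}: {seg['text']}")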
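For completeness, an end-to-end sketch of how the updated function would be driven. The model names, file path, and pipeline options are illustrative assumptions, not taken from this diff; the diff only shows that transcription_result['chunks'] carries word-level timestamps and that diarization supports itertracks(yield_label=True):

from transformers import pipeline
from pyannote.audio import Pipeline

# Illustrative models; app.py's actual configuration is outside this diff.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    return_timestamps="word",  # yields the 'chunks' list the function consumes
)
diarizer = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")  # may need an HF token

audio_path = "meeting.wav"  # hypothetical input file
transcription_result = asr(audio_path)
diarization = diarizer(audio_path)

result = associate_speakers_with_timestamps(transcription_result, diarization)
print(result["metadata"])  # e.g. {'speaker_count': 2, 'speakers': [...]}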