Update app.py
Browse files
app.py
CHANGED
@@ -49,61 +49,58 @@ pipe = pipeline(
|
|
49 |
|
50 |
|
51 |
|
52 |
-
def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.
|
53 |
word_segments = transcription_result['chunks']
|
54 |
diarization_segments = list(diarization.itertracks(yield_label=True))
|
55 |
speaker_transcription = []
|
56 |
current_speaker = None
|
57 |
current_text = []
|
58 |
-
|
59 |
-
last_segment_index = 0
|
60 |
|
61 |
def flush_current_segment():
|
62 |
nonlocal current_speaker, current_text
|
63 |
if current_speaker and current_text:
|
64 |
-
|
65 |
-
if segment_duration >= min_segment_duration:
|
66 |
-
speaker_transcription.append((current_speaker, ' '.join(current_text)))
|
67 |
-
else:
|
68 |
-
unassigned_words.extend([(word['timestamp'][0], word['text']) for word in word_segments])
|
69 |
current_text = []
|
70 |
|
71 |
for word in word_segments:
|
72 |
word_start, word_end = word['timestamp']
|
73 |
word_text = word['text']
|
74 |
-
assigned = False
|
75 |
|
76 |
-
|
77 |
-
|
|
|
78 |
if segment.start - tolerance <= word_start < segment.end + tolerance:
|
79 |
-
|
80 |
-
flush_current_segment()
|
81 |
-
current_speaker = speaker
|
82 |
-
current_text.append(word_text)
|
83 |
-
last_segment_index = i
|
84 |
-
assigned = True
|
85 |
break
|
86 |
|
87 |
-
if
|
88 |
-
|
|
|
|
|
|
|
89 |
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
-
# Traitement des mots non assignés
|
93 |
-
unassigned_words.sort(key=lambda x: x[0]) # Trier par timestamp
|
94 |
-
for word_start, word_text in unassigned_words:
|
95 |
-
closest_segment = min(diarization_segments, key=lambda x: min(abs(x[0].start - word_start), abs(x[0].end - word_start)))
|
96 |
-
speaker = closest_segment[2]
|
97 |
-
if speaker != current_speaker:
|
98 |
-
flush_current_segment()
|
99 |
-
current_speaker = speaker
|
100 |
-
current_text.append(word_text)
|
101 |
flush_current_segment()
|
102 |
|
103 |
-
#
|
104 |
merged_transcription = []
|
105 |
for speaker, text in speaker_transcription:
|
106 |
-
if not merged_transcription or merged_transcription[-1][0] != speaker:
|
107 |
merged_transcription.append((speaker, text))
|
108 |
else:
|
109 |
merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)
|
|
|
def associate_speakers_with_timestamps(transcription_result, diarization, tolerance=0.02, min_segment_duration=0.05):
    """Align word-level transcription chunks with speaker-diarization turns.

    Parameters
    ----------
    transcription_result : dict
        Whisper-pipeline style result; ``transcription_result['chunks']`` is a
        list of dicts, each with a ``'timestamp'`` (start, end) pair and a
        ``'text'`` string.
    diarization : object
        pyannote-style annotation exposing ``itertracks(yield_label=True)``
        yielding ``(segment, track, speaker)`` tuples where ``segment`` has
        ``.start`` / ``.end`` attributes (in seconds) — assumed from usage;
        confirm against the caller.
    tolerance : float
        Slack (seconds) added around each diarization segment when matching a
        word's start time to it.
    min_segment_duration : float
        Unused by the current algorithm; kept so existing callers that pass it
        keyword-wise keep working.

    Returns
    -------
    list[tuple[str, str]]
        ``(speaker, text)`` runs in order, with consecutive short (<= 3 word)
        runs of the same speaker merged into one entry.
    """
    word_segments = transcription_result['chunks']
    diarization_segments = list(diarization.itertracks(yield_label=True))
    speaker_transcription = []
    current_speaker = None
    current_text = []
    last_word_end = 0

    def flush_current_segment():
        # Commit the words accumulated for the current speaker, if any.
        nonlocal current_speaker, current_text
        if current_speaker and current_text:
            speaker_transcription.append((current_speaker, ' '.join(current_text)))
        current_text = []

    for word in word_segments:
        word_start, word_end = word['timestamp']
        word_text = word['text']

        # Find the diarization segment this word's start falls into
        # (with symmetric tolerance around the segment boundaries).
        matching_segment = None
        for segment, _, speaker in diarization_segments:
            if segment.start - tolerance <= word_start < segment.end + tolerance:
                matching_segment = (segment, speaker)
                break

        if matching_segment:
            segment, speaker = matching_segment
            if speaker != current_speaker:
                flush_current_segment()
                current_speaker = speaker

            # Break the running segment on long silences (> 1 second).
            if word_start - last_word_end > 1.0:
                flush_current_segment()

            current_text.append(word_text)
        else:
            # No matching diarization segment: keep the word with the last
            # known speaker, or open an "unknown speaker" segment if there
            # is no speaker yet.
            if not current_speaker:
                current_speaker = "SPEAKER_UNKNOWN"
            current_text.append(word_text)

        # Fix: advance the pause tracker on EVERY word. The original only
        # updated it in the matched branch, so an unmatched word left a stale
        # end time and the next pause check could flush spuriously.
        last_word_end = word_end

    flush_current_segment()

    # Merge short (<= 3 word) consecutive runs belonging to the same speaker.
    merged_transcription = []
    for speaker, text in speaker_transcription:
        if not merged_transcription or merged_transcription[-1][0] != speaker or len(text.split()) > 3:
            merged_transcription.append((speaker, text))
        else:
            merged_transcription[-1] = (speaker, merged_transcription[-1][1] + " " + text)

    # NOTE(review): the visible diff hunk ends at the statement above; the
    # original file presumably returns the merged list here — confirm against
    # the full app.py.
    return merged_transcription