File size: 7,475 Bytes
4322c44
 
1f131a4
 
 
 
 
 
 
 
 
 
 
4322c44
5725925
fb1eceb
5725925
1f131a4
 
 
fb1eceb
1f131a4
5725925
 
1f131a4
 
4322c44
1f131a4
4322c44
 
 
1f131a4
 
 
 
fb1eceb
1f131a4
 
 
 
 
 
 
 
 
 
 
 
 
5725925
 
 
1f131a4
 
 
5725925
1f131a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5725925
 
4322c44
5725925
fb1eceb
5725925
fb1eceb
5725925
 
 
fb1eceb
5725925
 
fb1eceb
1f131a4
 
fb1eceb
1f131a4
5725925
fb1eceb
5725925
 
 
fb1eceb
1f131a4
 
fb1eceb
5725925
 
fb1eceb
1f131a4
5725925
1f131a4
 
5725925
1f131a4
 
fb1eceb
5725925
 
4322c44
5725925
1f131a4
 
 
 
5725925
1f131a4
 
 
 
 
 
fb1eceb
1f131a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb1eceb
5725925
4322c44
 
1f131a4
4322c44
1f131a4
4322c44
1f131a4
4322c44
 
5725925
1f131a4
5725925
4322c44
1f131a4
4322c44
 
 
 
5725925
 
214a4d6
5725925
4322c44
 
 
5725925
1f131a4
5725925
 
4322c44
 
5725925
4322c44
 
 
 
214a4d6
4322c44
 
 
 
5725925
 
1f131a4
5725925
4322c44
 
 
5725925
 
 
4322c44
 
5725925
4322c44
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import gradio as gr
import re
import difflib
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass

@dataclass
class Segment:
    """One speaker turn of a transcript: speaker, timestamp and spoken text."""
    speaker: str  # Speaker name as captured from the header line (markdown markers removed)
    timestamp: str  # Timestamp text from the header, kept verbatim as a string
    text: str  # Body of the turn with leading/trailing whitespace stripped
    index: int  # Position in the original list


# A speaker name is one or more alphabetic words separated by spaces/tabs,
# so both "Speaker" and "Speaker LastName" are captured in full.
_SPEAKER = r"[A-Za-z]+(?:[ \t]+[A-Za-z]+)*"
# Digits and colons, starting with a digit (e.g. 00:00:00 or 12:34).
_TIMESTAMP = r"[0-9][0-9:]*"

_SEGMENT_PATTERN = re.compile(
    # Header line, plain or markdown, followed by a blank line.
    rf"(?:\*\*)?({_SPEAKER})(?:\*\*)?\s+\*?({_TIMESTAMP})\*?\s*\n\n"
    # Body of the turn (lazy, may span several paragraphs).
    r"(.*?)"
    # Stop only where a complete next header begins (or at end of input).
    # Looking for a full "name + timestamp" header, rather than any paragraph
    # that starts with a letter, keeps multi-paragraph turns in one piece.
    rf"(?=\n\n(?:\*\*)?{_SPEAKER}(?:\*\*)?\s+\*?{_TIMESTAMP}\*?\s*(?:\n\n|\Z)|\Z)",
    re.DOTALL,
)


def extract_segments(transcript: str) -> List[Segment]:
    """
    Extract segments from a transcript.

    Works with both header formats, each followed by a blank line and the
    spoken text:
    - Speaker LastName 00:00:00
    - **Speaker LastName** *00:00:00*

    Args:
        transcript: Full transcript text. Empty or missing input yields no
            segments.

    Returns:
        The parsed segments in document order; ``Segment.index`` is the
        position of the segment in the returned list.
    """
    if not transcript:
        return []

    # The pattern is written in terms of "\n"; normalise Windows line endings
    # so pasted text with CRLF still parses.
    normalized = transcript.replace("\r\n", "\n")

    return [
        Segment(found.group(1), found.group(2), found.group(3).strip(), position)
        for position, found in enumerate(_SEGMENT_PATTERN.finditer(normalized))
    ]

def clean_text_for_matching(text):
    """Normalise transcript text so two versions of it can be compared.

    Markdown links are reduced to their label, asterisk emphasis markers are
    dropped, common punctuation becomes a space, whitespace runs collapse to
    a single space, and the result is lower-cased and trimmed.
    """
    # Ordered rewrite steps; each one is applied to the output of the previous.
    rewrite_steps = (
        (r'\[([^\]]+)\]\([^)]+\)', r'\1'),  # [label](url) -> label
        (r'\*\*|\*', ''),                   # bold / italic markers
        (r'[,.;:!?()[\]{}]', ' '),          # punctuation -> space
        (r'\s+', ' '),                      # collapse whitespace runs
    )

    normalized = text
    for pattern, substitute in rewrite_steps:
        normalized = re.sub(pattern, substitute, normalized)

    return normalized.lower().strip()

def find_best_matches(
    auto_segments: List["Segment"],
    human_segments: List["Segment"],
    min_similarity: float = 0.6,
) -> Dict[int, int]:
    """
    Find the best matching segments between auto and human transcripts.

    Matching is greedy: human segments are visited in order and each one
    claims the not-yet-claimed auto segment whose cleaned text is most
    similar to its own (``difflib.SequenceMatcher`` ratio). An auto segment
    is used at most once.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        human_segments: Segments parsed from the human-edited transcript.
        min_similarity: A pair is only accepted when its similarity ratio is
            strictly greater than this value.

    Returns:
        Mapping of human segment position -> auto segment position for every
        human segment that found a match. Unmatched segments are absent.
    """
    matches: Dict[int, int] = {}

    # Prepare cleaned texts for comparison
    auto_cleaned_texts = [clean_text_for_matching(seg.text) for seg in auto_segments]
    human_cleaned_texts = [clean_text_for_matching(seg.text) for seg in human_segments]

    # Auto positions already claimed; a set keeps the membership test O(1)
    # instead of scanning matches.values() for every candidate pair.
    claimed = set()

    # For each human segment, find the best matching auto segment
    for h_idx, h_text in enumerate(human_cleaned_texts):
        best_match = -1
        best_score = min_similarity

        for a_idx, a_text in enumerate(auto_cleaned_texts):
            if a_idx in claimed:
                continue

            matcher = difflib.SequenceMatcher(None, h_text, a_text)

            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); if either cannot beat the current best, the expensive
            # ratio() cannot either, so the result is unchanged by skipping.
            if (
                matcher.real_quick_ratio() <= best_score
                or matcher.quick_ratio() <= best_score
            ):
                continue

            similarity = matcher.ratio()
            if similarity > best_score:
                best_score = similarity
                best_match = a_idx

        # If we found a good match, record it
        if best_match != -1:
            matches[h_idx] = best_match
            claimed.add(best_match)

    return matches

def _find_timestamp_spans(transcript, segments):
    """Map each segment's list position to the (start, end) span of its timestamp in *transcript*."""
    spans = {}
    cursor = 0
    for position, segment in enumerate(segments):
        # Speaker and timestamp are escaped so they are matched literally.
        # Markdown markers are optional, so plain and markdown headers are
        # both found without guessing the format of the whole transcript.
        header = re.compile(
            rf"(?<![A-Za-z])(?:\*\*)?{re.escape(segment.speaker)}(?:\*\*)?\s+"
            rf"\*?({re.escape(segment.timestamp)})(?![0-9:])"
        )
        # Segments are in document order, so searching forward from the end
        # of the previous header ties the n-th segment to its own header even
        # when the same speaker/timestamp pair occurs more than once.
        found = header.search(transcript, cursor)
        if found is None:
            continue
        spans[position] = found.span(1)
        cursor = found.end()
    return spans


def _preview(text, limit=50):
    """Return *text* truncated to *limit* characters, with "..." appended when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def update_timestamps(human_transcript: str, auto_transcript: str) -> Tuple[str, str]:
    """
    Update timestamps in human transcript using timestamps from auto transcript.

    Both transcripts are parsed into segments, human segments are paired with
    auto segments by text similarity, and the timestamp of every paired human
    segment is overwritten in place with the auto segment's timestamp. All
    other text (speaker names, markdown markers, spacing, wording) is left
    exactly as it was.

    Args:
        human_transcript: Human-edited transcript whose timestamps are stale.
        auto_transcript: Auto-generated transcript with the correct timestamps.

    Returns:
        ``(updated_transcript, report)`` where ``report`` is a markdown
        summary. If either transcript yields no segments, an error message is
        returned in place of the transcript and the report is empty.
    """
    # Extract segments from both transcripts
    human_segments = extract_segments(human_transcript)
    auto_segments = extract_segments(auto_transcript)

    if not human_segments or not auto_segments:
        return "Error: Could not parse transcripts. Check formatting.", ""

    # Find matching segments based on text similarity
    matches = find_best_matches(auto_segments, human_segments)

    # Locate every human header once, by position in the original text.
    spans = _find_timestamp_spans(human_transcript, human_segments)

    updated_transcript = human_transcript
    applied = []

    # Spans grow with the segment position, so walking the matches from the
    # last segment to the first keeps the earlier spans valid while the text
    # is being edited.
    for h_idx in sorted(matches, reverse=True):
        span = spans.get(h_idx)
        if span is None:
            continue
        start, end = span
        new_timestamp = auto_segments[matches[h_idx]].timestamp
        updated_transcript = (
            updated_transcript[:start] + new_timestamp + updated_transcript[end:]
        )
        applied.append(h_idx)

    applied.reverse()  # back to document order for the report

    # Generate report (counts reflect timestamps that were actually rewritten)
    match_count = len(applied)
    human_count = len(human_segments)
    auto_count = len(auto_segments)

    report_parts = [
        "### Timestamp Update Report\n\n",
        f"- Human segments: {human_count}\n",
        f"- Auto segments: {auto_count}\n",
        f"- Matched segments with updated timestamps: {match_count} ({match_count/human_count*100:.1f}%)\n",
    ]

    if match_count < human_count:
        report_parts.append(f"- Segments not updated: {human_count - match_count}\n")

    # Show up to 5 example matches for verification
    if applied:
        report_parts.append("\n### Example matches (for verification):\n\n")

        for h_idx in applied[:5]:
            h_seg = human_segments[h_idx]
            a_seg = auto_segments[matches[h_idx]]

            report_parts.append(
                f"- {h_seg.speaker}: timestamp changed from `{h_seg.timestamp}` to `{a_seg.timestamp}`\n"
            )
            report_parts.append(f"  - Human: \"{_preview(h_seg.text)}\"\n")
            report_parts.append(f"  - Auto: \"{_preview(a_seg.text)}\"\n\n")

    return updated_transcript, "".join(report_parts)

# Create Gradio interface: two input text boxes side by side, one button, and
# a tabbed output area (updated transcript / matching report).
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    gr.Markdown("""
    # 🎙️ Transcript Timestamp Updater
    
    This tool updates timestamps in a human-edited transcript by taking correct timestamps from an auto-generated transcript.
    
    ## Instructions:
    1. Paste your auto-generated transcript (with correct timestamps)
    2. Paste your human-edited transcript (with old timestamps that need updating)
    3. Click "Update Timestamps"
    
    The tool will preserve all human edits and only update the timestamps.
    """)
    
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="Auto-Generated Transcript (with correct timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15
            )
        
        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (timestamps need updating)",
                placeholder="Paste your human-edited transcript here...",
                lines=15
            )
    
    update_btn = gr.Button("Update Timestamps")
    
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )
        
        with gr.TabItem("Report"):
            report = gr.Markdown(
                label="Matching Report",
                value="Report will appear here..."
            )
    
    # The input order follows update_timestamps(human_transcript, auto_transcript),
    # which is the reverse of the on-screen left-to-right order of the boxes.
    update_btn.click(
        fn=update_timestamps,
        inputs=[human_transcript, auto_transcript],
        outputs=[updated_transcript, report]
    )

# Launch the app only when run as a script, not when imported
if __name__ == "__main__":
    demo.launch()