File size: 5,415 Bytes
4322c44
 
5725925
4322c44
5725925
fb1eceb
5725925
 
fb1eceb
5725925
fb1eceb
5725925
 
4322c44
5725925
4322c44
 
 
5725925
fb1eceb
5725925
 
fb1eceb
5725925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4322c44
5725925
fb1eceb
5725925
 
fb1eceb
5725925
 
 
fb1eceb
5725925
 
fb1eceb
5725925
 
fb1eceb
5725925
 
fb1eceb
5725925
 
 
fb1eceb
5725925
 
fb1eceb
5725925
 
fb1eceb
5725925
 
 
 
 
 
 
 
 
fb1eceb
5725925
 
4322c44
5725925
 
 
 
 
fb1eceb
5725925
 
 
fb1eceb
5725925
4322c44
 
5725925
4322c44
5725925
4322c44
5725925
4322c44
 
5725925
 
 
4322c44
5725925
4322c44
 
 
 
5725925
 
214a4d6
5725925
4322c44
 
 
5725925
 
 
 
4322c44
 
5725925
4322c44
 
 
 
214a4d6
4322c44
 
 
 
5725925
 
 
 
4322c44
 
 
5725925
 
 
4322c44
 
5725925
4322c44
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import gradio as gr
import re
from typing import List, Dict, Tuple

def extract_segments(transcript):
    """
    Split a transcript into its speaker segments.

    A segment is a header line made of a speaker name (letters only,
    optionally wrapped in ``**``) followed by a timestamp (digits and colons,
    optionally wrapped in ``*``), then a blank line, then the spoken text.
    The text runs until the next blank line that is followed by a letter
    (optionally preceded by ``**``), or until the end of the transcript.

    Returns a list of tuples: (speaker, timestamp, text), with the text
    stripped of surrounding whitespace.
    """
    # DOTALL lets the lazy text group span single line breaks inside a segment.
    segment_re = re.compile(
        r"(?:\*\*)?([A-Za-z]+)(?:\*\*)?\s+\*?([0-9:]+)\*?\s*\n\n(.*?)(?=\n\n(?:\*\*)?[A-Za-z]+|\Z)",
        re.DOTALL,
    )
    return [
        (found.group(1), found.group(2), found.group(3).strip())
        for found in segment_re.finditer(transcript)
    ]

def find_matching_segments(auto_segments, human_segments):
    """
    Pair each human segment with an auto segment of the same speaker.

    Returns a dictionary mapping human segment index to auto segment index.

    The pairing is purely positional per speaker: the first human segment by
    a given speaker gets that speaker's first auto segment, the second gets
    the second, and so on. Human segments whose speaker has no auto segments
    left (or never appears in the auto transcript) are omitted from the result.
    """
    # Collect, per speaker, the auto indices in the order they occur.
    indices_by_speaker = {}
    for auto_idx, (name, _, _) in enumerate(auto_segments):
        indices_by_speaker.setdefault(name, []).append(auto_idx)

    # One iterator per speaker hands out that speaker's indices exactly once.
    remaining = {name: iter(idxs) for name, idxs in indices_by_speaker.items()}

    pairing = {}
    for human_idx, (name, _, _) in enumerate(human_segments):
        supply = remaining.get(name)
        if supply is None:
            continue
        auto_idx = next(supply, None)
        if auto_idx is not None:
            pairing[human_idx] = auto_idx

    return pairing

def update_timestamps(human_transcript, auto_transcript):
    """
    Update timestamps in human transcript using timestamps from auto transcript.
    Preserves all human edits and formatting.

    Both transcripts are parsed with extract_segments and paired with
    find_matching_segments. For every paired human segment, only the
    characters of its timestamp are replaced by the auto timestamp; the
    speaker name, any markdown markers and the surrounding whitespace are
    left exactly as the human wrote them.

    Returns a tuple (updated_transcript, report). If either transcript is
    empty or yields no segments, the first element is an error message and
    the second is an empty string.
    """
    parse_error = ("Error: Could not parse transcripts. Check formatting.", "")

    # An empty or missing input cannot be parsed; report it the same way as
    # a transcript with no recognizable segments instead of raising.
    if not human_transcript or not auto_transcript:
        return parse_error

    # Extract segments from both transcripts
    human_segments = extract_segments(human_transcript)
    auto_segments = extract_segments(auto_transcript)

    if not human_segments or not auto_segments:
        return parse_error

    # Find matching segments
    matches = find_matching_segments(auto_segments, human_segments)

    # Locate every human header by position, walking forward through the
    # transcript. Searching from the end of the previous header (rather than
    # replacing "the first occurrence" anywhere in the text) keeps segments
    # that share the same speaker and timestamp from being confused with
    # each other or with an earlier mention.
    edits = []  # (start, end, new_timestamp) spans in human_transcript
    cursor = 0
    for h_idx, (h_speaker, h_timestamp, _) in enumerate(human_segments):
        # Mirrors the header part of the segment syntax: optional ** around
        # the speaker, optional * around the timestamp, then a blank line.
        # A speaker without ** must not start in the middle of a word.
        header = re.compile(
            r"(?:\*\*|(?<![A-Za-z]))" + re.escape(h_speaker)
            + r"(?:\*\*)?\s+\*?(" + re.escape(h_timestamp) + r")\*?\s*\n\n"
        )
        found = header.search(human_transcript, cursor)
        if found is None:
            # Header could not be located; leave this segment untouched.
            continue
        cursor = found.end()

        if h_idx in matches:
            a_timestamp = auto_segments[matches[h_idx]][1]
            edits.append((found.start(1), found.end(1), a_timestamp))

    # Rebuild the transcript in one pass, swapping only the timestamp spans.
    pieces = []
    copied_up_to = 0
    for start, end, a_timestamp in edits:
        pieces.append(human_transcript[copied_up_to:start])
        pieces.append(a_timestamp)
        copied_up_to = end
    pieces.append(human_transcript[copied_up_to:])
    updated_transcript = "".join(pieces)

    # Generate report (counts reflect replacements that actually happened)
    updated_count = len(edits)
    report = "### Timestamp Update Report\n\n"
    report += f"- Human segments: {len(human_segments)}\n"
    report += f"- Auto segments: {len(auto_segments)}\n"
    report += f"- Updated timestamps: {updated_count}\n"

    if updated_count < len(human_segments):
        unmatched = len(human_segments) - updated_count
        report += f"- Segments not updated: {unmatched}\n"

    return updated_transcript, report

# Create Gradio interface
# Layout: intro text, two side-by-side input boxes, a button, then a tabbed
# output area (updated transcript / report).
with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
    gr.Markdown("""
    # 🎙️ Simple Transcript Timestamp Updater
    
    This tool updates timestamps in a human-edited transcript based on an auto-generated transcript.
    
    ## Instructions:
    1. Paste your auto-generated transcript (with correct timestamps)
    2. Paste your human-edited transcript (with old timestamps)
    3. Click "Update Timestamps"
    
    The tool will update only the timestamps while preserving all human edits.
    """)
    
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="Auto-Generated Transcript (with correct timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15
            )
        
        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (with old timestamps)",
                placeholder="Paste your human-edited transcript here...",
                lines=15
            )
    
    update_btn = gr.Button("Update Timestamps")
    
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )
        
        with gr.TabItem("Report"):
            report = gr.Markdown(
                label="Report",
                value="Report will appear here..."
            )
    
    # The auto transcript is shown on the left, but update_timestamps takes
    # the human transcript first, so the inputs are listed in that order.
    update_btn.click(
        fn=update_timestamps,
        inputs=[human_transcript, auto_transcript],
        outputs=[updated_transcript, report]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()