Spaces:

dwarkesh
/

transcript-aligner

Sleeping

File size: 5,415 Bytes

import gradio as gr
import re
from typing import List, Dict, Tuple

def extract_segments(transcript: str) -> List[Tuple[str, str, str]]:
    """
    Extract segments from a transcript.

    A segment is a header -- a speaker name made of letters (optionally
    wrapped in ``**``) followed by a timestamp made of digits and colons
    (optionally wrapped in ``*``) -- then a blank line, then the spoken text.
    The text runs until the next complete header or the end of the
    transcript, so a segment may span several paragraphs.

    Returns a list of tuples: (speaker, timestamp, text), in document order,
    with surrounding whitespace stripped from the text. An input with no
    recognisable header yields an empty list.
    """
    header = r"(?:\*\*)?([A-Za-z]+)(?:\*\*)?\s+\*?([0-9:]+)\*?\s*\n\n"
    # The terminating lookahead must see a *complete* header (speaker and
    # timestamp followed by a blank line). Stopping at any paragraph that
    # merely starts with a letter would cut multi-paragraph text short.
    next_header = r"\n\n(?:\*\*)?[A-Za-z]+(?:\*\*)?\s+\*?[0-9:]+\*?\s*\n\n"
    pattern = header + r"(.*?)(?=" + next_header + r"|\Z)"

    return [
        (speaker, timestamp, text.strip())
        for speaker, timestamp, text in re.findall(pattern, transcript, re.DOTALL)
    ]

def find_matching_segments(auto_segments, human_segments):
    """
    Pair every human segment with an auto segment spoken by the same speaker.

    Pairing is purely positional per speaker: the first human segment of a
    speaker gets that speaker's first auto segment, the second gets the
    second, and so on. Human segments whose speaker has no auto segment left
    (or never appears in the auto transcript) are left out of the result.

    Returns a dictionary mapping human segment index to auto segment index.
    """
    # Collect, per speaker, the auto segment indices in document order.
    positions_by_name = {}
    for auto_pos, (name, _stamp, _body) in enumerate(auto_segments):
        positions_by_name.setdefault(name, []).append(auto_pos)

    # One forward-only cursor per speaker hands out each index exactly once.
    cursors = {name: iter(positions) for name, positions in positions_by_name.items()}

    pairing = {}
    for human_pos, (name, _stamp, _body) in enumerate(human_segments):
        cursor = cursors.get(name)
        if cursor is None:
            continue
        auto_pos = next(cursor, None)
        if auto_pos is not None:
            pairing[human_pos] = auto_pos

    return pairing

def _find_timestamp_span(transcript, speaker, timestamp, start):
    """Return the (start, end) span of `timestamp` in the first header for
    `speaker` located at or after index `start`, or None if there is none."""
    # Same header shape extract_segments accepts, specialised to one speaker
    # and timestamp. The lookahead insists the timestamp ends the header, so
    # "0:01" can never match the front of "0:01:30".
    header = re.compile(
        r"(?:\*\*)?" + re.escape(speaker) + r"(?:\*\*)?\s+\*?("
        + re.escape(timestamp) + r")(?=\*?\s*\n\n)"
    )
    found = header.search(transcript, start)
    return found.span(1) if found else None


def update_timestamps(human_transcript, auto_transcript):
    """
    Update timestamps in human transcript using timestamps from auto transcript.

    Human and auto segments are paired by find_matching_segments. For every
    paired human segment, only the characters of its timestamp are replaced
    with the paired auto timestamp; speaker markup, spacing and all text are
    left exactly as written.

    Parameters:
        human_transcript: the edited transcript whose timestamps are stale.
        auto_transcript: the transcript carrying the timestamps to copy.

    Returns:
        (updated_transcript, report) where report is a Markdown summary.
        If either transcript yields no segments, returns
        ("Error: Could not parse transcripts. Check formatting.", "").
    """
    # Extract segments from both transcripts
    human_segments = extract_segments(human_transcript)
    auto_segments = extract_segments(auto_transcript)

    if not human_segments or not auto_segments:
        return "Error: Could not parse transcripts. Check formatting.", ""

    # Find matching segments
    matches = find_matching_segments(auto_segments, human_segments)

    pieces = []
    copied_upto = 0  # end of the part of human_transcript already emitted
    cursor = 0       # where the search for the next header resumes
    updated = 0

    # Walk the segments in document order and locate each header by position.
    # Replacing "the first textual occurrence" instead would hit the wrong
    # header whenever a speaker/timestamp pair repeats or shows up in text.
    for h_idx, (h_speaker, h_timestamp, h_text) in enumerate(human_segments):
        span = _find_timestamp_span(human_transcript, h_speaker, h_timestamp, cursor)
        if span is None:
            continue
        ts_start, ts_end = span

        # Resume after this segment's text so header look-alikes inside the
        # text are never mistaken for the next segment's header.
        text_start = human_transcript.find(h_text, ts_end)
        cursor = ts_end if text_start < 0 else text_start + len(h_text)

        if h_idx in matches:
            _, a_timestamp, _ = auto_segments[matches[h_idx]]
            pieces.append(human_transcript[copied_upto:ts_start])
            pieces.append(a_timestamp)
            copied_upto = ts_end
            updated += 1

    pieces.append(human_transcript[copied_upto:])
    updated_transcript = "".join(pieces)

    # Generate report (counts reflect replacements actually performed)
    report_lines = [
        "### Timestamp Update Report\n\n",
        f"- Human segments: {len(human_segments)}\n",
        f"- Auto segments: {len(auto_segments)}\n",
        f"- Updated timestamps: {updated}\n",
    ]
    if updated < len(human_segments):
        report_lines.append(f"- Segments not updated: {len(human_segments) - updated}\n")

    return updated_transcript, "".join(report_lines)

# Create Gradio interface
# The UI is assembled when this module is executed; `demo` is the Blocks app
# that the __main__ guard at the bottom launches.
with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
    # Static header and usage instructions shown at the top of the page.
    gr.Markdown("""
    # 🎙️ Simple Transcript Timestamp Updater
    
    This tool updates timestamps in a human-edited transcript based on an auto-generated transcript.
    
    ## Instructions:
    1. Paste your auto-generated transcript (with correct timestamps)
    2. Paste your human-edited transcript (with old timestamps)
    3. Click "Update Timestamps"
    
    The tool will update only the timestamps while preserving all human edits.
    """)
    
    # Two input boxes in one row: auto transcript in the first column,
    # human transcript in the second.
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="Auto-Generated Transcript (with correct timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15
            )
        
        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (with old timestamps)",
                placeholder="Paste your human-edited transcript here...",
                lines=15
            )
    
    update_btn = gr.Button("Update Timestamps")
    
    # Output area: one tab for the rewritten transcript, one for the report.
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )
        
        with gr.TabItem("Report"):
            report = gr.Markdown(
                label="Report",
                value="Report will appear here..."
            )
    
    # Inputs are listed in update_timestamps' parameter order (human first,
    # then auto), which is the reverse of the on-screen column order.
    # The function's two return values fill the transcript box and the report;
    # when parsing fails it returns the error message as the first value, so
    # the message appears in the "Updated Transcript" box with an empty report.
    update_btn.click(
        fn=update_timestamps,
        inputs=[human_transcript, auto_transcript],
        outputs=[updated_transcript, report]
    )

# Launch the app
# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()