# Spaces: dwarkesh/transcript-aligner (Running) - File size: 7,475 Bytes

import gradio as gr
import re
import difflib
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass

@dataclass
class Segment:
    """One parsed speaker turn of a transcript.

    Groups the speaker label and timestamp string read from the turn's
    header line with the turn's text and its ordinal position.
    """
    speaker: str    # Speaker label captured from the header line
    timestamp: str  # Timestamp exactly as written in the header, e.g. "00:00:00"
    text: str       # Body text that follows the header
    index: int      # Position of this segment in the list it was parsed into

# Header + body of one segment.
#   group 1: speaker name - one or more alphabetic words on a single line,
#            optionally wrapped in ** (bold)
#   group 2: timestamp made of digits and colons, optionally wrapped in *
#   group 3: body text, which runs until the next blank line that is followed
#            by a paragraph starting with a letter (optionally bold), or to
#            the end of the input
# Capturing every word of the name (not just the last one) keeps the speaker
# value in line with the documented "Speaker LastName" formats.
_SEGMENT_RE = re.compile(
    r"(?:\*\*)?([A-Za-z]+(?:[ \t]+[A-Za-z]+)*)(?:\*\*)?"
    r"\s+\*?([0-9:]+)\*?\s*\n\n"
    r"(.*?)(?=\n\n(?:\*\*)?[A-Za-z]+|\Z)",
    re.DOTALL,
)


def extract_segments(transcript):
    """
    Extract segments from a transcript.

    Works with both header formats:
    - Speaker LastName 00:00:00
    - **Speaker LastName** *00:00:00*

    Each header must be followed by a blank line and then the segment text.

    Args:
        transcript: Full transcript text.

    Returns:
        A list of ``Segment`` objects in order of appearance. ``speaker``
        holds the full name as written in the header (without bold markers),
        ``timestamp`` the digits-and-colons string, ``text`` the stripped
        body, and ``index`` the zero-based position in the returned list.
        The list is empty when no header is found.

    Note:
        The captured text stops at the first blank line that is followed by
        a paragraph beginning with a letter (optionally bold), so for
        multi-paragraph turns only the leading part is captured.
    """
    return [
        Segment(speaker, timestamp, text.strip(), position)
        for position, (speaker, timestamp, text) in enumerate(
            found.groups() for found in _SEGMENT_RE.finditer(transcript)
        )
    ]

def clean_text_for_matching(text):
    """Normalize text so two transcripts can be compared on wording alone.

    Returns the lower-cased, stripped text after applying the substitutions
    listed below, in order.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        # Markdown links: keep the label, drop the target
        (r'\[([^\]]+)\]\([^)]+\)', r'\1'),
        # Markdown emphasis markers
        (r'\*\*|\*', ''),
        # Punctuation becomes a space so adjacent words stay separated
        (r'[,.;:!?()[\]{}]', ' '),
        # Collapse every whitespace run into a single space
        (r'\s+', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower().strip()

def find_best_matches(auto_segments, human_segments, min_similarity=0.6):
    """
    Find the best matching segments between auto and human transcripts.

    Human segments are processed in order. Each one claims the still-unclaimed
    auto segment whose cleaned text is most similar to its own, provided the
    similarity is strictly greater than ``min_similarity``. An auto segment is
    assigned to at most one human segment.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        human_segments: Segments parsed from the human-edited transcript.
        min_similarity: Score (``difflib.SequenceMatcher.ratio``) a pair must
            exceed to be accepted. Defaults to 0.6.

    Returns:
        Dict mapping human segment index -> matched auto segment index.
        Human segments without a good enough match are absent.
    """
    # Prepare cleaned texts once; they are reused across all comparisons.
    auto_cleaned_texts = [clean_text_for_matching(seg.text) for seg in auto_segments]
    human_cleaned_texts = [clean_text_for_matching(seg.text) for seg in human_segments]

    matches = {}
    # Auto indices already assigned; a set keeps the membership test O(1).
    claimed_auto = set()

    for h_idx, h_text in enumerate(human_cleaned_texts):
        best_match = -1
        best_score = min_similarity

        for a_idx, a_text in enumerate(auto_cleaned_texts):
            if a_idx in claimed_auto:
                continue

            # NOTE(review): SequenceMatcher's default autojunk heuristic kicks
            # in for second sequences of 200+ items; consider checking whether
            # it skews scores for long segments.
            matcher = difflib.SequenceMatcher(None, h_text, a_text)

            # Both quick ratios are upper bounds on ratio(), so a pair that
            # cannot beat the current best is skipped without paying for the
            # full comparison. The selected match is unaffected.
            if (matcher.real_quick_ratio() <= best_score
                    or matcher.quick_ratio() <= best_score):
                continue

            similarity = matcher.ratio()
            if similarity > best_score:
                best_score = similarity
                best_match = a_idx

        if best_match != -1:
            matches[h_idx] = best_match
            claimed_auto.add(best_match)

    return matches

def _locate_header(transcript, segment, start):
    """Return the regex match for *segment*'s header at or after *start*, or None.

    The pattern accepts both plain and markdown headers and exposes the
    timestamp characters as the named group ``ts``.
    """
    header_re = (
        # The speaker must not be the tail of a longer word.
        r"(?<![A-Za-z])" + re.escape(segment.speaker)
        # Optional closing bold marker, whitespace, optional italic marker.
        + r"(?:\*\*)?\s+\*?"
        + r"(?P<ts>" + re.escape(segment.timestamp) + r")"
        # A header is always followed by a blank line; this also prevents
        # matching a prefix of a longer timestamp.
        + r"(?=\*?\s*\n\n)"
    )
    return re.compile(header_re).search(transcript, start)


def _build_report(human_segments, auto_segments, applied):
    """Render the Markdown report for the timestamps that were rewritten.

    ``applied`` maps human segment index -> auto segment index for every
    segment whose timestamp was actually replaced.
    """
    human_count = len(human_segments)
    updated_count = len(applied)
    percent = updated_count / human_count * 100 if human_count else 0.0

    lines = [
        "### Timestamp Update Report\n\n",
        f"- Human segments: {human_count}\n",
        f"- Auto segments: {len(auto_segments)}\n",
        f"- Matched segments with updated timestamps: {updated_count} ({percent:.1f}%)\n",
    ]

    if updated_count < human_count:
        lines.append(f"- Segments not updated: {human_count - updated_count}\n")

    if applied:
        lines.append("\n### Example matches (for verification):\n\n")

        # Show up to 5 matches, with text truncated for readability.
        for h_idx, a_idx in list(applied.items())[:5]:
            h_seg = human_segments[h_idx]
            a_seg = auto_segments[a_idx]

            h_preview = h_seg.text[:50] + "..." if len(h_seg.text) > 50 else h_seg.text
            a_preview = a_seg.text[:50] + "..." if len(a_seg.text) > 50 else a_seg.text

            lines.append(
                f"- {h_seg.speaker}: timestamp changed from `{h_seg.timestamp}` to `{a_seg.timestamp}`\n"
            )
            lines.append(f"  - Human: \"{h_preview}\"\n")
            lines.append(f"  - Auto: \"{a_preview}\"\n\n")

    return "".join(lines)


def update_timestamps(human_transcript, auto_transcript):
    """
    Update timestamps in human transcript using timestamps from auto transcript.

    Both transcripts are parsed into segments, human segments are paired with
    auto segments by text similarity, and for every paired human segment the
    timestamp in its header is replaced by the paired auto segment's
    timestamp. Only the timestamp characters are rewritten; speaker names,
    markdown markers, spacing and all body text are left as they were.

    Args:
        human_transcript: Human-edited transcript whose timestamps are stale.
        auto_transcript: Auto-generated transcript carrying the timestamps to
            copy over.

    Returns:
        A ``(updated_transcript, report)`` tuple of strings. When either
        transcript yields no segments, the first element is an error message
        and the second is an empty string.
    """
    human_segments = extract_segments(human_transcript)
    auto_segments = extract_segments(auto_transcript)

    if not human_segments or not auto_segments:
        return "Error: Could not parse transcripts. Check formatting.", ""

    matches = find_best_matches(auto_segments, human_segments)

    # Walk the headers in document order. Searching from the end of the
    # previous header pins each replacement to its own segment, even when the
    # same "speaker timestamp" text occurs more than once.
    pieces = []
    copied_upto = 0   # end of the source text already copied into `pieces`
    search_from = 0   # where to start looking for the next header
    applied = {}      # human index -> auto index, only for real replacements

    for h_idx, human_seg in enumerate(human_segments):
        header = _locate_header(human_transcript, human_seg, search_from)
        if header is None:
            # Should not happen for a parsed segment; leave the text untouched.
            continue
        search_from = header.end()

        a_idx = matches.get(h_idx)
        if a_idx is None:
            continue

        ts_start, ts_end = header.span("ts")
        pieces.append(human_transcript[copied_upto:ts_start])
        pieces.append(auto_segments[a_idx].timestamp)
        copied_upto = ts_end
        applied[h_idx] = a_idx

    pieces.append(human_transcript[copied_upto:])
    updated_transcript = "".join(pieces)

    return updated_transcript, _build_report(human_segments, auto_segments, applied)

# Build the Gradio interface. Components are created inside nested context
# managers, so the statement order below defines the page layout.
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    # Introductory heading and usage instructions
    gr.Markdown("""
    # 🎙️ Transcript Timestamp Updater
    
    This tool updates timestamps in a human-edited transcript by taking correct timestamps from an auto-generated transcript.
    
    ## Instructions:
    1. Paste your auto-generated transcript (with correct timestamps)
    2. Paste your human-edited transcript (with old timestamps that need updating)
    3. Click "Update Timestamps"
    
    The tool will preserve all human edits and only update the timestamps.
    """)
    
    # One row with two columns, each holding one input textbox
    with gr.Row():
        with gr.Column():
            # Source of the timestamps to copy
            auto_transcript = gr.Textbox(
                label="Auto-Generated Transcript (with correct timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15
            )
        
        with gr.Column():
            # Text to keep; its timestamps get replaced
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (timestamps need updating)",
                placeholder="Paste your human-edited transcript here...",
                lines=15
            )
    
    update_btn = gr.Button("Update Timestamps")
    
    # Outputs: one tab for the rewritten transcript, one for the report
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )
        
        with gr.TabItem("Report"):
            report = gr.Markdown(
                label="Matching Report",
                value="Report will appear here..."
            )
    
    # Inputs are listed human-first to match the parameter order of
    # update_timestamps(human_transcript, auto_transcript), even though the
    # auto textbox is created first above. The two returned strings feed the
    # transcript output and the report output, in that order.
    update_btn.click(
        fn=update_timestamps,
        inputs=[human_transcript, auto_transcript],
        outputs=[updated_transcript, report]
    )

# Launch the app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()