# Hugging Face Space: dwarkesh/transcript-aligner (status: Sleeping) - file size: 8,739 bytes

import gradio as gr
import re
import difflib
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass

@dataclass
class Segment:
    """One parsed transcript segment: a header (speaker + timestamp) and its body.

    Instances are built by ``parse_transcript`` and rebuilt by
    ``update_timestamps`` when a timestamp is replaced. The header style
    (markdown or plain) is not stored on the segment.
    """
    # Speaker label captured from the header line.
    speaker: str
    # Timestamp string captured from the header line.
    timestamp: str
    # Body text of the segment as it should be written back out.
    text: str
    # Body text with markdown formatting removed; used only for similarity matching.
    raw_text: str

def parse_transcript(transcript: str) -> List[Segment]:
    """Parse a transcript into segments, handling both markdown and plain formats.

    A segment is a header, a blank line, and the body that follows. Recognised
    headers are ``**Speaker X** *00:00:00*`` (markdown) and
    ``Speaker X 00:00:00`` (plain). The ``Speaker `` prefix is optional and is
    not part of the captured speaker label.

    A body runs until the next header or the end of the input, so bodies made
    of several paragraphs are kept whole.

    Args:
        transcript: Full transcript text.

    Returns:
        Segments in document order. ``text`` is the stripped body; ``raw_text``
        is the body with emphasis markers removed and markdown links reduced
        to their visible text. Empty when no header is found.
    """
    # Header plus the blank line separating it from the body.
    header = (
        r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? "
        r"(?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n"
    )
    # A blank line ends the body only when the following line is one the
    # header pattern above would match (the header may start anywhere on that
    # line, exactly as the unanchored search allows). A bare "\n\n" must not
    # end the body, otherwise every paragraph after the first is lost.
    next_header = (
        r"\n\n[^\n]*?\w(?:\*\*)? (?:\*)?\d{2}:\d{2}:\d{2}(?:\*)?\s*\n\n"
    )
    pattern = header + r"(.*?)(?=" + next_header + r"|\Z)"

    # The pattern relies on "\n\n" as the blank-line marker; pasted text with
    # Windows line endings would otherwise never match.
    normalized = transcript.replace("\r\n", "\n")

    segments = []
    for match in re.finditer(pattern, normalized, re.DOTALL):
        speaker, timestamp, text = match.groups()
        body = text.strip()
        # For matching purposes keep a link's visible text (it is spoken
        # content) and drop only the URL, then remove emphasis markers.
        raw_text = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", body)
        raw_text = re.sub(r"\*\*|\*", "", raw_text)
        segments.append(Segment(speaker, timestamp, body, raw_text))

    return segments

def clean_text_for_comparison(text: str) -> str:
    """Normalise text so two versions of the same passage compare as similar.

    Markdown links are reduced to their visible text, the text is lowercased,
    and every character that is neither a word character nor whitespace is
    removed (this also removes ``*`` emphasis markers).

    Args:
        text: Segment text, possibly containing markdown.

    Returns:
        The normalised text with leading and trailing whitespace stripped.
    """
    # Keep the anchor text of "[anchor](url)": those words are part of what
    # was said, only the URL is markup. The character classes keep the match
    # from running across unrelated brackets earlier on the line.
    text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)
    # Punctuation removal covers emphasis asterisks and any leftover brackets.
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text.strip()

def match_segments(
    auto_segments: List[Segment],
    human_segments: List[Segment],
    threshold: float = 0.6,
) -> List[Tuple[int, int]]:
    """Match segments between auto and human transcripts using text similarity.

    Human segments are processed in order. Each one is paired with the
    not-yet-used auto segment whose cleaned text has the highest
    ``difflib.SequenceMatcher`` ratio, provided that ratio reaches
    ``threshold``. Ties keep the earliest auto segment. Every auto segment is
    used at most once.

    Args:
        auto_segments: Segments of the auto-generated transcript.
        human_segments: Segments of the human-edited transcript.
        threshold: Minimum similarity ratio for a pair to count as a match.

    Returns:
        List of ``(auto_index, human_index)`` tuples, in human-segment order.
    """
    # Prepare clean versions of texts for comparison
    auto_texts = [clean_text_for_comparison(seg.raw_text) for seg in auto_segments]
    human_texts = [clean_text_for_comparison(seg.raw_text) for seg in human_segments]

    matches = []
    used_auto = set()  # auto indices already paired; O(1) membership test

    # SequenceMatcher caches its analysis of the second sequence, so the human
    # text is set once per outer iteration and only the first sequence varies.
    matcher = difflib.SequenceMatcher(None)

    for human_idx, human_text in enumerate(human_texts):
        matcher.set_seq2(human_text)
        best_match_idx = -1
        best_similarity = 0

        for auto_idx, auto_text in enumerate(auto_texts):
            if auto_idx in used_auto:
                continue

            matcher.set_seq1(auto_text)

            # real_quick_ratio() and quick_ratio() are upper bounds on ratio().
            # A candidate whose bound is below the threshold, or cannot beat
            # the current best, is skipped without the expensive comparison.
            upper = matcher.real_quick_ratio()
            if upper < threshold or upper <= best_similarity:
                continue
            upper = matcher.quick_ratio()
            if upper < threshold or upper <= best_similarity:
                continue

            similarity = matcher.ratio()
            if similarity > best_similarity and similarity >= threshold:
                best_similarity = similarity
                best_match_idx = auto_idx

        if best_match_idx >= 0:
            matches.append((best_match_idx, human_idx))
            used_auto.add(best_match_idx)

    return matches

def update_timestamps(
    auto_segments: List[Segment],
    human_segments: List[Segment],
    matches: List[Tuple[int, int]],
    is_markdown: Optional[bool] = None,
) -> str:
    """Rebuild the human transcript with timestamps taken from matched auto segments.

    Speaker and text of every human segment are kept. Only the timestamp of a
    matched segment is replaced. Unmatched human segments are emitted
    unchanged. The input lists are not modified.

    Args:
        auto_segments: Segments of the auto-generated transcript.
        human_segments: Segments of the human-edited transcript.
        matches: ``(auto_index, human_index)`` pairs.
        is_markdown: Output header style. ``True`` writes
            ``**speaker** *timestamp*``, ``False`` writes
            ``Speaker speaker timestamp``. When ``None`` the style is guessed
            from the first human segment (see note below).

    Returns:
        The segments joined by blank lines; an empty string when
        ``human_segments`` is empty.
    """
    updated_segments = human_segments.copy()

    # Update timestamps based on matches
    for auto_idx, human_idx in matches:
        edited = human_segments[human_idx]
        # Keep the human-edited text, update only the timestamp
        updated_segments[human_idx] = Segment(
            speaker=edited.speaker,
            timestamp=auto_segments[auto_idx].timestamp,
            text=edited.text,
            raw_text=edited.raw_text,
        )

    if is_markdown is None:
        # NOTE: Segment does not record the header style, so this guess only
        # sees the first body and timestamp. A markdown transcript whose first
        # body has no "**" is therefore rendered in the plain style; callers
        # that know the format should pass is_markdown explicitly.
        is_markdown = bool(human_segments) and (
            "**" in human_segments[0].text
            or "*" in human_segments[0].timestamp
        )

    # Generate the updated transcript
    result = []
    for segment in updated_segments:
        if is_markdown:
            result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
        else:
            result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

    return "\n\n".join(result)

def get_unmatched_auto_segments(auto_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
    """Return, in ascending order, the auto-segment indices absent from ``matches``."""
    # The auto index is the first element of every match pair.
    paired = set()
    for pair in matches:
        paired.add(pair[0])

    leftover = []
    for position, _ in enumerate(auto_segments):
        if position in paired:
            continue
        leftover.append(position)
    return leftover

def get_unmatched_human_segments(human_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
    """Return, in ascending order, the human-segment indices absent from ``matches``."""
    # The human index is the second element of every match pair.
    paired = set(pair[1] for pair in matches)
    every_index = range(len(human_segments))
    return list(filter(lambda position: position not in paired, every_index))

def format_segments(segments: List[Segment], indices: List[int], is_markdown: bool) -> str:
    """Render the selected segments as transcript text.

    Args:
        segments: Pool of segments to pick from.
        indices: Positions in ``segments`` to render, in output order.
        is_markdown: Choose the ``**speaker** *timestamp*`` header style
            instead of the plain ``Speaker speaker timestamp`` style.

    Returns:
        The rendered segments separated by blank lines, or the literal string
        ``"None"`` when ``indices`` is empty.
    """
    if not indices:
        return "None"

    def render(entry):
        # One header line, a blank line, then the body.
        if is_markdown:
            return f"**{entry.speaker}** *{entry.timestamp}*\n\n{entry.text}"
        return f"Speaker {entry.speaker} {entry.timestamp}\n\n{entry.text}"

    return "\n\n".join(render(segments[position]) for position in indices)

def process_transcripts(auto_transcript: str, human_transcript: str) -> Tuple[str, str]:
    """Align two transcripts and report what could and could not be matched.

    Args:
        auto_transcript: Auto-generated transcript carrying the new timestamps.
        human_transcript: Human-edited transcript carrying the old timestamps.

    Returns:
        A ``(updated_transcript, stats_markdown)`` pair. When either transcript
        yields no segments, the first element is an error message and the
        second is empty.
    """
    # Treat a missing value like an empty text box instead of failing in the parser.
    auto_transcript = auto_transcript or ""
    human_transcript = human_transcript or ""

    auto_segments = parse_transcript(auto_transcript)
    human_segments = parse_transcript(human_transcript)

    # Basic validation. The click handler in this file binds exactly two
    # outputs, so the error path must return two values like the success path.
    if not auto_segments or not human_segments:
        return "Error: Could not parse transcripts. Check formatting.", ""

    matches = match_segments(auto_segments, human_segments)

    unmatched_auto = get_unmatched_auto_segments(auto_segments, matches)
    unmatched_human = get_unmatched_human_segments(human_segments, matches)

    # Any asterisk in the human transcript is taken as a sign of markdown
    # (a "**" check would be implied by this one).
    # NOTE(review): this flag only styles the unmatched-segment listings below;
    # update_timestamps decides its own output style from the first segment.
    is_markdown = "*" in human_transcript

    updated_transcript = update_timestamps(auto_segments, human_segments, matches)

    parts = [
        "### Matching Statistics\n\n",
        f"- Auto-generated segments: {len(auto_segments)}\n",
        f"- Human-edited segments: {len(human_segments)}\n",
        f"- Matched segments: {len(matches)}\n",
        f"- Unmatched auto segments (new content): {len(unmatched_auto)}\n",
        f"- Unmatched human segments (removed content): {len(unmatched_human)}\n",
    ]

    if unmatched_auto:
        parts.append("\n### New Content (In Auto-generated but not in Human-edited)\n\n")
        parts.append(format_segments(auto_segments, unmatched_auto, is_markdown))

    if unmatched_human:
        parts.append("\n### Removed Content (In Human-edited but not in Auto-generated)\n\n")
        parts.append(format_segments(human_segments, unmatched_human, is_markdown))

    stats = "".join(parts)
    return updated_transcript, stats

# Create Gradio interface
# The Blocks object is bound to the module-level name `demo`, which the launch
# guard at the bottom of the file uses.
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    # Introductory text and usage instructions shown to the user.
    gr.Markdown("""
    # 🎙️ Transcript Timestamp Updater
    
    This tool updates timestamps in human-edited transcripts based on auto-generated transcripts.
    
    ## Instructions:
    1. Paste your new auto-generated transcript (with updated timestamps)
    2. Paste your human-edited transcript (with old timestamps)
    3. Click "Update Timestamps"
    
    The tool will match segments between transcripts and update the timestamps while preserving all human edits.
    """)
    
    # Input row: one column per transcript.
    with gr.Row():
        with gr.Column():
            # First input of process_transcripts.
            auto_transcript = gr.TextArea(
                label="Auto-Generated Transcript (with new timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15
            )
        
        with gr.Column():
            # Second input of process_transcripts.
            human_transcript = gr.TextArea(
                label="Human-Edited Transcript (with old timestamps)",
                placeholder="Paste the human-edited transcript here...",
                lines=15
            )
    
    update_btn = gr.Button("Update Timestamps")
    
    # Output area: one tab for the rebuilt transcript, one for the statistics.
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            # Receives the first value returned by process_transcripts.
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )
        
        with gr.TabItem("Statistics"):
            # Receives the second value returned by process_transcripts.
            stats = gr.Markdown(
                label="Statistics",
                value="Statistics will appear here..."
            )
    
    # Wire the button: two inputs in, two outputs out, in the order listed.
    update_btn.click(
        fn=process_transcripts,
        inputs=[auto_transcript, human_transcript],
        outputs=[updated_transcript, stats]
    )

# Launch the app
# Only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch()