Spaces:

dwarkesh
/

transcript-aligner

Running

App Files Files Community

dwarkesh committed 18 days ago

Commit

4322c44

verified ·

1 Parent(s): c0cc6fd

Create app.py

Browse files

Files changed (1) hide show

app.py +265 -0

app.py ADDED Viewed

	@@ -0,0 +1,265 @@

+import gradio as gr
+import re
+import difflib
+from typing import List, Dict, Tuple, Optional
+import numpy as np
+from dataclasses import dataclass
@dataclass
class Segment:
    """One speaker turn extracted from a transcript.

    Attributes:
        speaker: Speaker label captured from the turn's header line.
        timestamp: Timestamp string captured from the header; the parsers
            in this file match it as two-digit ``HH:MM:SS``.
        text: Turn body with surrounding whitespace stripped; any markdown
            formatting is kept.
        raw_text: Turn body with markdown markers removed, used only when
            comparing turns for similarity.
    """

    speaker: str
    timestamp: str
    text: str
    raw_text: str  # For matching purposes - text without formatting
@dataclass
class Match:
    """A pairing between one auto-generated turn and one human-edited turn.

    Attributes:
        auto_index: Position of the turn in the auto-generated segment list.
        human_index: Position of the turn in the human-edited segment list.
        similarity: Similarity score recorded for the pair when it was matched.
    """

    auto_index: int
    human_index: int
    similarity: float
def parse_auto_transcript(transcript: str) -> List[Segment]:
    """Parse the auto-generated transcript into speaker turns.

    A turn starts with a header of the form ``Speaker X 00:00:00`` (the
    markdown variant ``**Speaker X** *00:00:00*`` is also accepted), followed
    by a blank line and the turn body. The body runs until a blank line that
    is followed by another complete header, or until the end of the input.

    Args:
        transcript: Full transcript text.

    Returns:
        Segments in order of appearance. ``text`` is the stripped body and
        ``raw_text`` is the same body with ``*`` / ``**`` markers removed.
        Returns an empty list when no header is found.
    """
    # Pasted text can carry "\r\n" or "\r" line endings; the patterns below
    # only recognise "\n\n" as a blank line.
    transcript = transcript.replace("\r\n", "\n").replace("\r", "\n")

    header = r"(?:\*\*)?Speaker (\w+)(?:\*\*)? \*?(\d{2}:\d{2}:\d{2})\*?\s*\n\n"
    # Same shape as `header` without capture groups. Requiring the complete
    # header (not just the word "Speaker") keeps a body paragraph that merely
    # begins with "Speaker ..." from terminating the turn and being dropped.
    next_header = r"(?:\*\*)?Speaker \w+(?:\*\*)? \*?\d{2}:\d{2}:\d{2}\*?\s*\n\n"
    pattern = header + r"(.*?)(?=\n\n" + next_header + r"|\Z)"

    segments = []
    for found in re.finditer(pattern, transcript, re.DOTALL):
        speaker, timestamp, body = found.groups()
        body = body.strip()
        # Remove markdown emphasis markers for matching purposes.
        raw_text = re.sub(r'\*\*|\*', '', body)
        segments.append(Segment(speaker, timestamp, body, raw_text))
    return segments
def parse_human_transcript(transcript: str) -> List[Segment]:
    """Parse the human-edited transcript into speaker turns.

    Accepts both the markdown header form ``**Speaker X** *00:00:00*`` and the
    plain form ``Speaker X 00:00:00``; the literal ``Speaker `` prefix is
    optional. The header must be followed by a blank line and then the body.

    The body of a turn extends across any number of paragraphs and ends only
    at a blank line whose following line contains another header (or at the
    end of the input). The speaker label is a single ``\\w+`` token, so for a
    multi-word name only the last word is captured.

    Args:
        transcript: Full transcript text.

    Returns:
        Segments in order of appearance. ``text`` is the stripped body with
        formatting intact; ``raw_text`` has ``*`` / ``**`` markers removed and
        markdown links reduced to their anchor text. Returns an empty list
        when no header is found.
    """
    # Pasted text can carry "\r\n" or "\r" line endings; the patterns below
    # only recognise "\n\n" as a blank line.
    transcript = transcript.replace("\r\n", "\n").replace("\r", "\n")

    header = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? \*?(\d{2}:\d{2}:\d{2})\*?\s*\n\n"
    # The body must stop only where the next paragraph's first line contains
    # something the header pattern above would match. A lookahead made only of
    # optional pieces succeeds at every blank line and truncates each turn to
    # its first paragraph, silently losing the rest of the text.
    next_header = r"[^\n]*?\w(?:\*\*)? \*?\d{2}:\d{2}:\d{2}\*?\s*\n\n"
    pattern = header + r"(.*?)(?=\n\n" + next_header + r"|\Z)"

    segments = []
    for found in re.finditer(pattern, transcript, re.DOTALL):
        speaker, timestamp, body = found.groups()
        body = body.strip()
        # For matching: keep the visible words of "[words](url)" links (they
        # are part of what was said) and drop the emphasis markers.
        raw_text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', body)
        raw_text = re.sub(r'\*\*|\*', '', raw_text)
        segments.append(Segment(speaker, timestamp, body, raw_text))
    return segments
def similarity_score(text1: str, text2: str) -> float:
    """Return a character-level similarity ratio for two text segments.

    Both inputs are lowercased and stripped of every character that is neither
    a word character nor whitespace, then compared with
    ``difflib.SequenceMatcher``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        ``SequenceMatcher.ratio()`` of the cleaned strings: a float between
        0.0 and 1.0, where identical cleaned strings score 1.0.
    """
    # NOTE(review): SequenceMatcher is used with its default `autojunk`
    # setting, which per the difflib docs kicks in for sequences of 200+
    # items; confirm that is acceptable for long turns.
    clean1 = re.sub(r'[^\w\s]', '', text1.lower())
    clean2 = re.sub(r'[^\w\s]', '', text2.lower())
    return difflib.SequenceMatcher(None, clean1, clean2).ratio()
def find_best_matches(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Match]:
    """Greedily pair auto-generated turns with human-edited turns.

    Two passes are made over the auto segments in order. In the first pass a
    pair is accepted only when its similarity is at least 0.6; in the second
    pass the auto segments still unmatched are retried with a threshold of
    0.4. Within a pass, each auto segment takes the not-yet-used human segment
    with the highest similarity (the earliest one on ties), and a human
    segment is never used twice.

    Args:
        auto_segments: Turns parsed from the auto-generated transcript.
        human_segments: Turns parsed from the human-edited transcript.

    Returns:
        Matches in the order they were accepted: all first-pass matches (by
        auto index), then all second-pass matches (by auto index).
    """
    matches = []
    matched_auto_indices = set()
    used_human_indices = set()
    # Every pair examined in the second pass was already scored in the first
    # one, so cache scores instead of recomputing them.
    score_cache = {}

    def pair_score(auto_idx: int, human_idx: int) -> float:
        key = (auto_idx, human_idx)
        if key not in score_cache:
            score_cache[key] = similarity_score(
                auto_segments[auto_idx].raw_text,
                human_segments[human_idx].raw_text,
            )
        return score_cache[key]

    # High-confidence pass first, then a more lenient pass for the leftovers.
    for threshold in (0.6, 0.4):
        for auto_idx in range(len(auto_segments)):
            if auto_idx in matched_auto_indices:
                continue
            best_match_idx = -1
            best_similarity = 0.0
            for human_idx in range(len(human_segments)):
                if human_idx in used_human_indices:
                    continue
                similarity = pair_score(auto_idx, human_idx)
                # Strict ">" keeps the earliest candidate when scores tie.
                if similarity > best_similarity and similarity >= threshold:
                    best_similarity = similarity
                    best_match_idx = human_idx
            if best_match_idx >= 0:
                matches.append(Match(auto_idx, best_match_idx, best_similarity))
                matched_auto_indices.add(auto_idx)
                used_human_indices.add(best_match_idx)
    return matches
def update_timestamps(auto_segments: List[Segment], human_segments: List[Segment], matches: List[Match]) -> str:
    """Rebuild the human transcript with timestamps taken from matched auto turns.

    For every match, the human segment at ``match.human_index`` gets the
    timestamp of the auto segment at ``match.auto_index``; speaker and text
    are kept. Unmatched human segments keep their original timestamp. The
    input lists are not modified.

    Args:
        auto_segments: Turns parsed from the auto-generated transcript.
        human_segments: Turns parsed from the human-edited transcript.
        matches: Pairings between the two lists.

    Returns:
        The re-rendered transcript, turns separated by blank lines. Returns
        an empty string when ``human_segments`` is empty.
    """
    if not human_segments:
        return ""

    # Work on a shallow copy so the caller's list is left untouched.
    updated_segments = human_segments.copy()
    for match in matches:
        human_segment = human_segments[match.human_index]
        updated_segments[match.human_index] = Segment(
            speaker=human_segment.speaker,
            timestamp=auto_segments[match.auto_index].timestamp,
            text=human_segment.text,
            raw_text=human_segment.raw_text,
        )

    # The output style is decided once, from the first human segment.
    # NOTE(review): this looks at the first turn's body and timestamp, not at
    # how its header was written, so a markdown header over a plain body is
    # rendered in the plain style - confirm whether that is intended.
    first = human_segments[0]
    use_markdown = "**" in first.text or "*" in first.timestamp

    if use_markdown:
        rendered = [
            f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}"
            for segment in updated_segments
        ]
    else:
        rendered = [
            f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}"
            for segment in updated_segments
        ]
    return "\n\n".join(rendered)
def find_unmatched_segments(auto_segments: List[Segment], matches: List[Match]) -> List[int]:
    """Return the indices of auto segments that appear in no match.

    Args:
        auto_segments: Turns parsed from the auto-generated transcript.
        matches: Pairings whose ``auto_index`` values mark matched turns.

    Returns:
        Indices into ``auto_segments``, in ascending order, that are not the
        ``auto_index`` of any match.
    """
    matched_auto_indices = {match.auto_index for match in matches}
    return [
        index
        for index in range(len(auto_segments))
        if index not in matched_auto_indices
    ]
def format_unmatched_segments(auto_segments: List[Segment], unmatched_indices: List[int], is_markdown: bool) -> str:
    """Render the unmatched auto segments as a markdown section for display.

    Args:
        auto_segments: Turns parsed from the auto-generated transcript.
        unmatched_indices: Indices into ``auto_segments`` to render.
        is_markdown: When true, headers are written as
            ``**Speaker X** *timestamp*``; otherwise as ``Speaker X timestamp``.

    Returns:
        A heading followed by the rendered turns separated by blank lines, or
        a fixed "No unmatched segments found" message when there is nothing
        to show.
    """
    if not unmatched_indices:
        return "No unmatched segments found"

    unmatched = [auto_segments[idx] for idx in unmatched_indices]
    if is_markdown:
        rendered = [
            f"**Speaker {segment.speaker}** *{segment.timestamp}*\n\n{segment.text}"
            for segment in unmatched
        ]
    else:
        rendered = [
            f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}"
            for segment in unmatched
        ]
    return "### Unmatched Segments (New Content)\n\n" + "\n\n".join(rendered)
def process_transcripts(auto_transcript: str, human_transcript: str):
    """Align two transcripts and produce the three outputs shown in the UI.

    Args:
        auto_transcript: New auto-generated transcript text.
        human_transcript: Human-edited transcript text.

    Returns:
        A 3-tuple of strings:
        ``(updated_transcript, unmatched_segments, stats)``. When either
        transcript yields no segments, the first element is an error message
        and the other two are empty strings.
    """
    auto_segments = parse_auto_transcript(auto_transcript)
    human_segments = parse_human_transcript(human_transcript)

    # Nothing to align if either side failed to parse.
    if not auto_segments or not human_segments:
        return "Error: Could not parse one or both transcripts. Please check the format.", "", ""

    matches = find_best_matches(auto_segments, human_segments)
    unmatched_indices = find_unmatched_segments(auto_segments, matches)

    # Any asterisk in the human transcript is taken to mean markdown.
    is_markdown = "**" in human_transcript or "*" in human_transcript

    updated_transcript = update_timestamps(auto_segments, human_segments, matches)
    unmatched_segments = format_unmatched_segments(auto_segments, unmatched_indices, is_markdown)

    # Collect the statistics as pieces and join once at the end.
    stats_parts = [
        "### Matching Statistics\n\n",
        f"- Auto-generated segments: {len(auto_segments)}\n",
        f"- Human-edited segments: {len(human_segments)}\n",
        f"- Matched segments: {len(matches)}\n",
        f"- Unmatched segments: {len(unmatched_indices)}\n",
    ]
    if matches:
        similarities = [match.similarity for match in matches]
        stats_parts.append(
            f"- Average match similarity: {sum(similarities)/len(similarities):.2f}\n"
        )
        # Bin edges start at 0.4, the lowest similarity a match can have.
        bins = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        hist, _ = np.histogram(similarities, bins=bins)
        stats_parts.append("\n#### Match Quality Distribution\n\n")
        for lower, upper, count in zip(bins, bins[1:], hist):
            stats_parts.append(f"- {lower:.1f}-{upper:.1f}: {count} matches\n")
    stats = "".join(stats_parts)

    return updated_transcript, unmatched_segments, stats
# Create Gradio interface
# Layout, top to bottom: intro text, two side-by-side input boxes, the action
# button, then three tabs that receive the outputs of process_transcripts.
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    gr.Markdown("""
    # Transcript Timestamp Updater
    This tool updates timestamps in a human-edited transcript based on a new auto-generated transcript.
    ## Instructions:
    1. Paste your new auto-generated transcript (with updated timestamps)
    2. Paste your human-edited transcript (with old timestamps)
    3. Click "Update Timestamps" to generate a new version of the human-edited transcript with updated timestamps
    The tool will try to match segments between the two transcripts and update the timestamps accordingly.
    """)

    # Inputs: auto transcript on the left, human transcript on the right.
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="New Auto-Generated Transcript (with updated timestamps)",
                placeholder="Paste the new auto-generated transcript here...",
                lines=15
            )
        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (with old timestamps)",
                placeholder="Paste your human-edited transcript here...",
                lines=15
            )

    update_btn = gr.Button("Update Timestamps")

    # Outputs: one tab per value returned by process_transcripts.
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Human Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )
        with gr.TabItem("Unmatched Segments"):
            unmatched_segments = gr.Markdown(
                label="Unmatched Segments",
                value="Unmatched segments will appear here..."
            )
        with gr.TabItem("Statistics"):
            stats = gr.Markdown(
                label="Matching Statistics",
                value="Statistics will appear here..."
            )

    # The order of `outputs` must match the order of the returned tuple.
    update_btn.click(
        fn=process_transcripts,
        inputs=[auto_transcript, human_transcript],
        outputs=[updated_transcript, unmatched_segments, stats]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()