dwarkesh committed (verified)
Commit 214a4d6 · 1 Parent(s): 4322c44

Update app.py

Files changed (1): app.py (+90 -139)
app.py CHANGED
--- app.py (before)
@@ -2,7 +2,6 @@ import gradio as gr
 import re
 import difflib
 from typing import List, Dict, Tuple, Optional
-import numpy as np
 from dataclasses import dataclass

 @dataclass
@@ -13,31 +12,10 @@ class Segment:
     text: str
     raw_text: str # For matching purposes - original text without formatting

-@dataclass
-class Match:
-    """Represents a match between segments"""
-    auto_index: int
-    human_index: int
-    similarity: float
-
-def parse_auto_transcript(transcript: str) -> List[Segment]:
-    """Parse the auto-generated transcript"""
-    # Pattern to match "Speaker X 00:00:00" followed by text
-    pattern = r"(?:\*\*)?Speaker (\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?Speaker |\Z)"
-    segments = []
-
-    for match in re.finditer(pattern, transcript, re.DOTALL):
-        speaker, timestamp, text = match.groups()
-        # Remove any markdown formatting for matching purposes
-        raw_text = re.sub(r'\*\*|\*', '', text.strip())
-        segments.append(Segment(speaker, timestamp, text.strip(), raw_text))
-
-    return segments
-
-def parse_human_transcript(transcript: str) -> List[Segment]:
-    """Parse the human-edited transcript"""
-    # Pattern to match both markdown and plain text formats
-    # This handles both "**Speaker X** *00:00:00*" and "Speaker X 00:00:00"
     pattern = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker )?|\Z)"
     segments = []

@@ -49,186 +27,165 @@ def parse_human_transcript(transcript: str) -> List[Segment]:

     return segments

-def similarity_score(text1: str, text2: str) -> float:
-    """Calculate similarity between two text segments"""
     # Remove all markdown, punctuation, and lowercase for better matching
-    clean1 = re.sub(r'[^\w\s]', '', text1.lower())
-    clean2 = re.sub(r'[^\w\s]', '', text2.lower())
-
-    # Use difflib's SequenceMatcher for similarity
-    return difflib.SequenceMatcher(None, clean1, clean2).ratio()

-def find_best_matches(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Match]:
-    """Find the best matching segments between auto and human transcripts"""
     matches = []
-    used_human_indices = set()

-    # First pass: Find obvious matches (high similarity)
-    for auto_idx, auto_segment in enumerate(auto_segments):
-        best_match_idx = -1
-        best_similarity = 0.0
-
-        for human_idx, human_segment in enumerate(human_segments):
-            if human_idx in used_human_indices:
-                continue
-
-            similarity = similarity_score(auto_segment.raw_text, human_segment.raw_text)
-
-            if similarity > best_similarity and similarity >= 0.6: # Threshold for a good match
-                best_similarity = similarity
-                best_match_idx = human_idx
-
-        if best_match_idx >= 0:
-            matches.append(Match(auto_idx, best_match_idx, best_similarity))
-            used_human_indices.add(best_match_idx)

-    # Second pass: Try to match remaining segments with a lower threshold
-    for auto_idx, auto_segment in enumerate(auto_segments):
-        if any(m.auto_index == auto_idx for m in matches):
-            continue
-
         best_match_idx = -1
-        best_similarity = 0.0

-        for human_idx, human_segment in enumerate(human_segments):
-            if human_idx in used_human_indices:
                 continue

-            similarity = similarity_score(auto_segment.raw_text, human_segment.raw_text)

-            if similarity > best_similarity and similarity >= 0.4: # Lower threshold
                 best_similarity = similarity
-                best_match_idx = human_idx

         if best_match_idx >= 0:
-            matches.append(Match(auto_idx, best_match_idx, best_similarity))
-            used_human_indices.add(best_match_idx)

     return matches

-def update_timestamps(auto_segments: List[Segment], human_segments: List[Segment], matches: List[Match]) -> str:
     """Update timestamps in human transcript based on matches"""
-    # Create a new list for the updated segments
     updated_segments = human_segments.copy()

-    for match in matches:
-        auto_segment = auto_segments[match.auto_index]
-        human_segment = human_segments[match.human_index]
-
-        # Update the timestamp in the human segment
-        updated_segments[match.human_index] = Segment(
-            speaker=human_segment.speaker,
-            timestamp=auto_segment.timestamp,
-            text=human_segment.text,
-            raw_text=human_segment.raw_text
         )

     # Generate the updated transcript
     result = []
     for segment in updated_segments:
-        # Check if this is a markdown-formatted transcript
-        if "**" in human_segments[0].text or "*" in human_segments[0].timestamp:
             result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
         else:
             result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

     return "\n\n".join(result)

-def find_unmatched_segments(auto_segments: List[Segment], matches: List[Match]) -> List[int]:
-    """Find segments in the auto transcript that weren't matched"""
-    matched_auto_indices = {match.auto_index for match in matches}
     return [i for i in range(len(auto_segments)) if i not in matched_auto_indices]

-def format_unmatched_segments(auto_segments: List[Segment], unmatched_indices: List[int], is_markdown: bool) -> str:
-    """Format unmatched segments for display"""
-    if not unmatched_indices:
-        return "No unmatched segments found"

     result = []
-    for idx in unmatched_indices:
-        segment = auto_segments[idx]
         if is_markdown:
-            result.append(f"**Speaker {segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
         else:
             result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

-    return "### Unmatched Segments (New Content)\n\n" + "\n\n".join(result)

 def process_transcripts(auto_transcript: str, human_transcript: str):
     """Process transcripts and update timestamps"""
-    # Parse both transcripts
-    auto_segments = parse_auto_transcript(auto_transcript)
-    human_segments = parse_human_transcript(human_transcript)

-    # Early check for empty inputs
     if not auto_segments or not human_segments:
-        return "Error: Could not parse one or both transcripts. Please check the format.", "", ""
-
-    # Find matches between segments
-    matches = find_best_matches(auto_segments, human_segments)

     # Find unmatched segments
-    unmatched_indices = find_unmatched_segments(auto_segments, matches)

-    # Determine if we're using markdown
     is_markdown = "**" in human_transcript or "*" in human_transcript

     # Update timestamps
     updated_transcript = update_timestamps(auto_segments, human_segments, matches)

-    # Format unmatched segments
-    unmatched_segments = format_unmatched_segments(auto_segments, unmatched_indices, is_markdown)
-
-    # Stats about the matching
     stats = f"### Matching Statistics\n\n"
     stats += f"- Auto-generated segments: {len(auto_segments)}\n"
     stats += f"- Human-edited segments: {len(human_segments)}\n"
     stats += f"- Matched segments: {len(matches)}\n"
-    stats += f"- Unmatched segments: {len(unmatched_indices)}\n"

-    # Add match quality histogram
-    if matches:
-        similarities = [match.similarity for match in matches]
-        stats += f"- Average match similarity: {sum(similarities)/len(similarities):.2f}\n"
-
-        # Histogram of match qualities
-        bins = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
-        hist, _ = np.histogram(similarities, bins=bins)
-        stats += "\n#### Match Quality Distribution\n\n"
-        for i, count in enumerate(hist):
-            lower = bins[i]
-            upper = bins[i+1]
-            stats += f"- {lower:.1f}-{upper:.1f}: {count} matches\n"

-    return updated_transcript, unmatched_segments, stats

 # Create Gradio interface
 with gr.Blocks(title="Transcript Timestamp Updater") as demo:
     gr.Markdown("""
-    # Transcript Timestamp Updater

-    This tool updates timestamps in a human-edited transcript based on a new auto-generated transcript.

     ## Instructions:
     1. Paste your new auto-generated transcript (with updated timestamps)
     2. Paste your human-edited transcript (with old timestamps)
-    3. Click "Update Timestamps" to generate a new version of the human-edited transcript with updated timestamps

-    The tool will try to match segments between the two transcripts and update the timestamps accordingly.
     """)

     with gr.Row():
         with gr.Column():
-            auto_transcript = gr.Textbox(
-                label="New Auto-Generated Transcript (with updated timestamps)",
-                placeholder="Paste the new auto-generated transcript here...",
                 lines=15
             )

         with gr.Column():
-            human_transcript = gr.Textbox(
                 label="Human-Edited Transcript (with old timestamps)",
-                placeholder="Paste your human-edited transcript here...",
                 lines=15
             )

@@ -237,27 +194,21 @@ with gr.Blocks(title="Transcript Timestamp Updater") as demo:
     with gr.Tabs():
         with gr.TabItem("Updated Transcript"):
             updated_transcript = gr.TextArea(
-                label="Updated Human Transcript",
                 placeholder="The updated transcript will appear here...",
                 lines=20
             )

-        with gr.TabItem("Unmatched Segments"):
-            unmatched_segments = gr.Markdown(
-                label="Unmatched Segments",
-                value="Unmatched segments will appear here..."
-            )
-
         with gr.TabItem("Statistics"):
             stats = gr.Markdown(
-                label="Matching Statistics",
                 value="Statistics will appear here..."
             )

     update_btn.click(
         fn=process_transcripts,
         inputs=[auto_transcript, human_transcript],
-        outputs=[updated_transcript, unmatched_segments, stats]
     )

 # Launch the app

+++ app.py (after)
 import re
 import difflib
 from typing import List, Dict, Tuple, Optional
 from dataclasses import dataclass

 @dataclass

     text: str
     raw_text: str # For matching purposes - original text without formatting

+def parse_transcript(transcript: str) -> List[Segment]:
+    """Parse a transcript into segments, handling both markdown and plain formats"""
+    # This pattern matches both markdown and plain text formats:
+    # - "**Speaker X** *00:00:00*" or "Speaker X 00:00:00"
     pattern = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker )?|\Z)"
     segments = []

     return segments

+def clean_text_for_comparison(text: str) -> str:
+    """Clean text for better comparison"""
     # Remove all markdown, punctuation, and lowercase for better matching
+    text = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text)
+    text = re.sub(r'[^\w\s]', '', text.lower())
+    return text.strip()

+def match_segments(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Tuple[int, int]]:
+    """Match segments between auto and human transcripts using text similarity
+    Returns list of tuples (auto_index, human_index)"""
     matches = []

+    # Prepare clean versions of texts for comparison
+    auto_texts = [clean_text_for_comparison(seg.raw_text) for seg in auto_segments]
+    human_texts = [clean_text_for_comparison(seg.raw_text) for seg in human_segments]

+    # Try to match each human segment to an auto segment
+    for human_idx, human_text in enumerate(human_texts):
         best_match_idx = -1
+        best_similarity = 0

+        for auto_idx, auto_text in enumerate(auto_texts):
+            # Skip if this auto segment is already matched
+            if any(match[0] == auto_idx for match in matches):
                 continue

+            # Calculate similarity
+            similarity = difflib.SequenceMatcher(None, auto_text, human_text).ratio()

+            if similarity > best_similarity and similarity >= 0.6: # Threshold
                 best_similarity = similarity
+                best_match_idx = auto_idx

         if best_match_idx >= 0:
+            matches.append((best_match_idx, human_idx))

     return matches

+def update_timestamps(auto_segments: List[Segment], human_segments: List[Segment], matches: List[Tuple[int, int]]) -> str:
     """Update timestamps in human transcript based on matches"""
     updated_segments = human_segments.copy()

+    # Update timestamps based on matches
+    for auto_idx, human_idx in matches:
+        # Keep the human-edited text, update only the timestamp
+        updated_segments[human_idx] = Segment(
+            speaker=human_segments[human_idx].speaker,
+            timestamp=auto_segments[auto_idx].timestamp,
+            text=human_segments[human_idx].text,
+            raw_text=human_segments[human_idx].raw_text
         )

+    # Determine if the human transcript uses markdown formatting
+    is_markdown = "**" in human_segments[0].text or "*" in human_segments[0].timestamp if human_segments else False
+
     # Generate the updated transcript
     result = []
     for segment in updated_segments:
+        if is_markdown:
             result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
         else:
             result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

     return "\n\n".join(result)

+def get_unmatched_auto_segments(auto_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
+    """Get indices of auto segments that weren't matched to any human segment"""
+    matched_auto_indices = {match[0] for match in matches}
     return [i for i in range(len(auto_segments)) if i not in matched_auto_indices]

+def get_unmatched_human_segments(human_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
+    """Get indices of human segments that weren't matched to any auto segment"""
+    matched_human_indices = {match[1] for match in matches}
+    return [i for i in range(len(human_segments)) if i not in matched_human_indices]
+
+def format_segments(segments: List[Segment], indices: List[int], is_markdown: bool) -> str:
+    """Format segments for display"""
+    if not indices:
+        return "None"

     result = []
+    for idx in indices:
+        segment = segments[idx]
         if is_markdown:
+            result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
         else:
             result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

+    return "\n\n".join(result)

 def process_transcripts(auto_transcript: str, human_transcript: str):
     """Process transcripts and update timestamps"""
+    # Parse transcripts
+    auto_segments = parse_transcript(auto_transcript)
+    human_segments = parse_transcript(human_transcript)

+    # Basic validation
     if not auto_segments or not human_segments:
+        return "Error: Could not parse transcripts. Check formatting.", "", ""
+
+    # Match segments
+    matches = match_segments(auto_segments, human_segments)

     # Find unmatched segments
+    unmatched_auto = get_unmatched_auto_segments(auto_segments, matches)
+    unmatched_human = get_unmatched_human_segments(human_segments, matches)

+    # Determine if the format uses markdown
     is_markdown = "**" in human_transcript or "*" in human_transcript

     # Update timestamps
     updated_transcript = update_timestamps(auto_segments, human_segments, matches)

+    # Format statistics
     stats = f"### Matching Statistics\n\n"
     stats += f"- Auto-generated segments: {len(auto_segments)}\n"
     stats += f"- Human-edited segments: {len(human_segments)}\n"
     stats += f"- Matched segments: {len(matches)}\n"
+    stats += f"- Unmatched auto segments (new content): {len(unmatched_auto)}\n"
+    stats += f"- Unmatched human segments (removed content): {len(unmatched_human)}\n"

+    # Format unmatched segments
+    if unmatched_auto:
+        stats += f"\n### New Content (In Auto-generated but not in Human-edited)\n\n"
+        stats += format_segments(auto_segments, unmatched_auto, is_markdown)
+
+    if unmatched_human:
+        stats += f"\n### Removed Content (In Human-edited but not in Auto-generated)\n\n"
+        stats += format_segments(human_segments, unmatched_human, is_markdown)

+    return updated_transcript, stats

 # Create Gradio interface
 with gr.Blocks(title="Transcript Timestamp Updater") as demo:
     gr.Markdown("""
+    # 🎙️ Transcript Timestamp Updater

+    This tool updates timestamps in human-edited transcripts based on auto-generated transcripts.

     ## Instructions:
     1. Paste your new auto-generated transcript (with updated timestamps)
     2. Paste your human-edited transcript (with old timestamps)
+    3. Click "Update Timestamps"

+    The tool will match segments between transcripts and update the timestamps while preserving all human edits.
     """)

     with gr.Row():
         with gr.Column():
+            auto_transcript = gr.TextArea(
+                label="Auto-Generated Transcript (with new timestamps)",
+                placeholder="Paste the auto-generated transcript here...",
                 lines=15
             )

         with gr.Column():
+            human_transcript = gr.TextArea(
                 label="Human-Edited Transcript (with old timestamps)",
+                placeholder="Paste the human-edited transcript here...",
                 lines=15
             )

     with gr.Tabs():
         with gr.TabItem("Updated Transcript"):
             updated_transcript = gr.TextArea(
+                label="Updated Transcript",
                 placeholder="The updated transcript will appear here...",
                 lines=20
             )

         with gr.TabItem("Statistics"):
             stats = gr.Markdown(
+                label="Statistics",
                 value="Statistics will appear here..."
             )

     update_btn.click(
         fn=process_transcripts,
         inputs=[auto_transcript, human_transcript],
+        outputs=[updated_transcript, stats]
     )

 # Launch the app
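
For reference, a minimal sketch (not part of the commit) of the segment format the unified pattern in the new parse_transcript is built to handle. The two-segment sample transcript and speaker names below are invented for illustration; the regex itself is taken verbatim from the diff above.

import re

pattern = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker )?|\Z)"

# One markdown-style header and one plain header, both accepted by the same pattern.
sample = (
    "**Speaker A** *00:00:05*\n\n"
    "Hello and welcome to the show.\n\n"
    "Speaker B 00:00:12\n\n"
    "Thanks for having me."
)

for speaker, timestamp, text in re.findall(pattern, sample, re.DOTALL):
    print(speaker, timestamp, text.strip())
# A 00:00:05 Hello and welcome to the show.
# B 00:00:12 Thanks for having me.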
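
And a second small sketch of the similarity scoring that match_segments relies on: clean_text_for_comparison (copied from the new code) strips markdown, links, and punctuation before difflib compares, and 0.6 is the matching threshold used in the diff. The two segment texts are invented examples.

import difflib
import re

def clean_text_for_comparison(text: str) -> str:
    # Same cleaning steps as the new app.py: drop markdown/links, then punctuation, lowercase.
    text = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text.strip()

auto_text = "So, I think the **main point** is scaling laws."
human_text = "I think the main point is scaling laws."

ratio = difflib.SequenceMatcher(
    None,
    clean_text_for_comparison(auto_text),
    clean_text_for_comparison(human_text),
).ratio()
print(f"{ratio:.2f}")  # well above the 0.6 threshold, so these segments would be paired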