Spaces:

dwarkesh
/

transcript-aligner

Running

App Files Files Community

dwarkesh committed on Apr 2

Commit

1f131a4

verified ·

1 Parent(s): 5725925

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -48

app.py CHANGED Viewed

@@ -1,50 +1,86 @@
 import gradio as gr
 import re
-from typing import List, Dict, Tuple
 def extract_segments(transcript):
     """
     Extract segments from a transcript.
-    Returns a list of tuples: (speaker, timestamp, text)
     """
     pattern = r"(?:\*\*)?([A-Za-z]+)(?:\*\*)?\s+\*?([0-9:]+)\*?\s*\n\n(.*?)(?=\n\n(?:\*\*)?[A-Za-z]+|\Z)"
-    segments = []
-    for match in re.finditer(pattern, transcript, re.DOTALL):
         speaker, timestamp, text = match.groups()
-        segments.append((speaker, timestamp, text.strip()))
     return segments
-def find_matching_segments(auto_segments, human_segments):
-    """
-    Find matching segments between auto and human transcripts.
-    Returns a dictionary mapping human segment index to auto segment index.
-    Very simple matching based on speaker sequence - assumes both transcripts
-    have the same speakers in the same order, just with different timestamps.
     """
     matches = {}
-    # Group segments by speaker
-    auto_by_speaker = {}
-    for i, (speaker, _, _) in enumerate(auto_segments):
-        if speaker not in auto_by_speaker:
-            auto_by_speaker[speaker] = []
-        auto_by_speaker[speaker].append(i)
-    # Match segments by speaker order
-    for h_idx, (speaker, _, _) in enumerate(human_segments):
-        if speaker in auto_by_speaker and auto_by_speaker[speaker]:
-            # Get the next available segment for this speaker
-            matches[h_idx] = auto_by_speaker[speaker].pop(0)
     return matches
 def update_timestamps(human_transcript, auto_transcript):
     """
     Update timestamps in human transcript using timestamps from auto transcript.
-    Preserves all human edits and formatting.
     """
     # Extract segments from both transcripts
     human_segments = extract_segments(human_transcript)
@@ -53,60 +89,79 @@ def update_timestamps(human_transcript, auto_transcript):
     if not human_segments or not auto_segments:
         return "Error: Could not parse transcripts. Check formatting.", ""
-    # Find matching segments
-    matches = find_matching_segments(auto_segments, human_segments)
-    # Create updated transcript
     updated_transcript = human_transcript
     # Replace timestamps in reverse order to avoid position shifts
     for h_idx in sorted(matches.keys(), reverse=True):
         a_idx = matches[h_idx]
-        h_speaker, h_timestamp, _ = human_segments[h_idx]
-        _, a_timestamp, _ = auto_segments[a_idx]
         # Determine if markdown is used
         is_markdown = "**" in human_transcript
-        # Create patterns to match the timestamp in the original text
         if is_markdown:
-            # For markdown format: **Speaker** *00:00:00*
-            pattern = fr"\*\*{h_speaker}\*\*\s+\*{h_timestamp}\*"
-            replacement = f"**{h_speaker}** *{a_timestamp}*"
         else:
-            # For plain format: Speaker 00:00:00
-            pattern = fr"{h_speaker}\s+{h_timestamp}"
-            replacement = f"{h_speaker} {a_timestamp}"
         # Replace the timestamp in the transcript
         updated_transcript = re.sub(pattern, replacement, updated_transcript, 1)
     # Generate report
     report = f"### Timestamp Update Report\n\n"
-    report += f"- Human segments: {len(human_segments)}\n"
-    report += f"- Auto segments: {len(auto_segments)}\n"
-    report += f"- Updated timestamps: {len(matches)}\n"
-    if len(matches) < len(human_segments):
-        unmatched = len(human_segments) - len(matches)
-        report += f"- Segments not updated: {unmatched}\n"
     return updated_transcript, report
 # Create Gradio interface
-with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
     gr.Markdown("""
-    # 🎙️ Simple Transcript Timestamp Updater
-    This tool updates timestamps in a human-edited transcript based on an auto-generated transcript.
     ## Instructions:
     1. Paste your auto-generated transcript (with correct timestamps)
-    2. Paste your human-edited transcript (with old timestamps)
     3. Click "Update Timestamps"
-    The tool will update only the timestamps while preserving all human edits.
     """)
     with gr.Row():
@@ -119,7 +174,7 @@ with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
         with gr.Column():
             human_transcript = gr.Textbox(
-                label="Human-Edited Transcript (with old timestamps)",
                 placeholder="Paste your human-edited transcript here...",
                 lines=15
             )
@@ -136,7 +191,7 @@ with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
         with gr.TabItem("Report"):
             report = gr.Markdown(
-                label="Report",
                 value="Report will appear here..."
             )

 import gradio as gr
 import re
+import difflib
+from typing import List, Dict, Tuple, Optional
+from dataclasses import dataclass
@dataclass
class Segment:
    """A segment of a transcript with speaker, timestamp, and text."""
    speaker: str  # Speaker name from the header, without the surrounding ** markers
    timestamp: str  # Timestamp text as written in the header (digits and colons)
    text: str  # Text that follows the header, stripped of surrounding whitespace
    index: int  # Position in the original list


# A header is a speaker name (one or more words made of letters), optionally
# wrapped in **...**, followed by a timestamp optionally wrapped in *...*, then
# a blank line. The captured text runs lazily until the next blank-line-separated
# paragraph that starts with a letter (optionally preceded by **) or the end of
# the input.
_SEGMENT_PATTERN = re.compile(
    r"(?:\*\*)?([A-Za-z]+(?:[ \t]+[A-Za-z]+)*)(?:\*\*)?\s+\*?([0-9:]+)\*?\s*\n\n"
    r"(.*?)(?=\n\n(?:\*\*)?[A-Za-z]+|\Z)",
    re.DOTALL,
)


def extract_segments(transcript):
    """
    Extract segments from a transcript.

    Works with both formats:
    - Speaker LastName 00:00:00
    - **Speaker LastName** *00:00:00*

    The whole multi-word speaker name is captured. Capturing only the last
    word would make the markdown replacement pattern built from the speaker
    (``**LastName**``) fail to match ``**Speaker LastName**`` in the transcript.

    Args:
        transcript: Full transcript text. An empty or None value yields no
            segments.

    Returns:
        A list of Segment objects in order of appearance; each ``index`` is
        the segment's position in that list.
    """
    if not transcript:
        return []

    segments = []
    for i, match in enumerate(_SEGMENT_PATTERN.finditer(transcript)):
        speaker, timestamp, text = match.groups()
        segments.append(Segment(speaker, timestamp, text.strip(), i))
    return segments
def clean_text_for_matching(text):
    """Clean text for better matching between transcripts.

    Applies, in order: markdown links are reduced to their label, asterisks
    used for emphasis are removed, a fixed set of punctuation characters is
    replaced by spaces, runs of whitespace are collapsed to one space, and
    the result is lower-cased and stripped.

    Args:
        text: Segment text to normalize.

    Returns:
        The normalized string used only for similarity comparison.
    """
    # [label](url) -> label
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Drop bold/italic markers
    text = re.sub(r'\*\*|\*', '', text)
    # Punctuation becomes a space so adjacent words do not get glued together
    text = re.sub(r'[,.;:!?()[\]{}]', ' ', text)
    # Collapse any whitespace run (including newlines) to a single space
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()
def find_best_matches(auto_segments, human_segments):
    """
    Find the best matching segments between auto and human transcripts.

    Uses text similarity to match segments. Human segments are processed in
    order; each one claims the not-yet-claimed auto segment with the highest
    similarity ratio, provided that ratio is strictly above 0.6.

    Args:
        auto_segments: Segments of the auto-generated transcript (objects
            with a ``text`` attribute).
        human_segments: Segments of the human-edited transcript (objects
            with a ``text`` attribute).

    Returns:
        A dict mapping human segment index to the matched auto segment index.
        Human segments without a sufficiently similar auto segment are absent.
    """
    matches = {}
    # Auto indices already claimed; a set keeps the membership test O(1)
    # instead of scanning matches.values() on every inner iteration.
    claimed_auto = set()

    # Prepare cleaned texts for comparison
    auto_cleaned_texts = [clean_text_for_matching(seg.text) for seg in auto_segments]
    human_cleaned_texts = [clean_text_for_matching(seg.text) for seg in human_segments]

    # For each human segment, find the best matching auto segment
    for h_idx, h_text in enumerate(human_cleaned_texts):
        best_match = -1
        best_score = 0.6  # Minimum similarity threshold

        for a_idx, a_text in enumerate(auto_cleaned_texts):
            # Skip already matched segments
            if a_idx in claimed_auto:
                continue

            matcher = difflib.SequenceMatcher(None, h_text, a_text)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); a candidate whose bound cannot beat the current best
            # can be skipped without changing the outcome.
            if matcher.real_quick_ratio() <= best_score:
                continue
            if matcher.quick_ratio() <= best_score:
                continue

            similarity = matcher.ratio()
            # If this is the best match so far, record it
            if similarity > best_score:
                best_score = similarity
                best_match = a_idx

        # If we found a good match, record it
        if best_match != -1:
            matches[h_idx] = best_match
            claimed_auto.add(best_match)

    return matches
 def update_timestamps(human_transcript, auto_transcript):
     """
     Update timestamps in human transcript using timestamps from auto transcript.
     """
     # Extract segments from both transcripts
     human_segments = extract_segments(human_transcript)
     if not human_segments or not auto_segments:
         return "Error: Could not parse transcripts. Check formatting.", ""
+    # Find matching segments based on text similarity
+    matches = find_best_matches(auto_segments, human_segments)
+    # Create updated transcript with new timestamps
     updated_transcript = human_transcript
     # Replace timestamps in reverse order to avoid position shifts
     for h_idx in sorted(matches.keys(), reverse=True):
         a_idx = matches[h_idx]
+        human_seg = human_segments[h_idx]
+        auto_seg = auto_segments[a_idx]
         # Determine if markdown is used
         is_markdown = "**" in human_transcript
+        # Create regex patterns to match the timestamp in the original text
         if is_markdown:
+            pattern = fr"\*\*{human_seg.speaker}\*\*\s+\*{human_seg.timestamp}\*"
+            replacement = f"**{human_seg.speaker}** *{auto_seg.timestamp}*"
         else:
+            pattern = fr"{human_seg.speaker}\s+{human_seg.timestamp}"
+            replacement = f"{human_seg.speaker} {auto_seg.timestamp}"
         # Replace the timestamp in the transcript
         updated_transcript = re.sub(pattern, replacement, updated_transcript, 1)
     # Generate report
+    match_count = len(matches)
+    human_count = len(human_segments)
+    auto_count = len(auto_segments)
     report = f"### Timestamp Update Report\n\n"
+    report += f"- Human segments: {human_count}\n"
+    report += f"- Auto segments: {auto_count}\n"
+    report += f"- Matched segments with updated timestamps: {match_count} ({match_count/human_count*100:.1f}%)\n"
+    if match_count < human_count:
+        report += f"- Segments not updated: {human_count - match_count}\n"
+    # Print some example matches for verification
+    if matches:
+        report += "\n### Example matches (for verification):\n\n"
+        # Show up to 5 matches
+        sample_matches = list(matches.items())[:5]
+        for h_idx, a_idx in sample_matches:
+            h_seg = human_segments[h_idx]
+            a_seg = auto_segments[a_idx]
+            # Truncate text samples for readability
+            h_preview = h_seg.text[:50] + "..." if len(h_seg.text) > 50 else h_seg.text
+            a_preview = a_seg.text[:50] + "..." if len(a_seg.text) > 50 else a_seg.text
+            report += f"- {h_seg.speaker}: timestamp changed from `{h_seg.timestamp}` to `{a_seg.timestamp}`\n"
+            report += f"  - Human: \"{h_preview}\"\n"
+            report += f"  - Auto: \"{a_preview}\"\n\n"
     return updated_transcript, report
 # Create Gradio interface
+with gr.Blocks(title="Transcript Timestamp Updater") as demo:
     gr.Markdown("""
+    # 🎙️ Transcript Timestamp Updater
+    This tool updates timestamps in a human-edited transcript by taking correct timestamps from an auto-generated transcript.
     ## Instructions:
     1. Paste your auto-generated transcript (with correct timestamps)
+    2. Paste your human-edited transcript (with old timestamps that need updating)
     3. Click "Update Timestamps"
+    The tool will preserve all human edits and only update the timestamps.
     """)
     with gr.Row():
         with gr.Column():
             human_transcript = gr.Textbox(
+                label="Human-Edited Transcript (timestamps need updating)",
                 placeholder="Paste your human-edited transcript here...",
                 lines=15
             )
         with gr.TabItem("Report"):
             report = gr.Markdown(
+                label="Matching Report",
                 value="Report will appear here..."
             )