"""Gradio app that copies timestamps from an auto-generated transcript
onto the matching segments of a human-edited transcript."""

import difflib
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import gradio as gr

# Minimum similarity a pair must exceed to count as a match.
DEFAULT_MATCH_THRESHOLD = 0.6
# Number of matches listed in the report for manual verification.
MAX_REPORT_EXAMPLES = 5
# Maximum number of characters of segment text shown in a report preview.
PREVIEW_LENGTH = 50

# A segment header: an optionally bold speaker name of one or more words,
# whitespace, an optionally italic timestamp, then a blank line.
# Group 1 is the speaker, group 2 the timestamp digits.
_HEADER_RE = re.compile(
    r"^[ \t]*(?:\*\*)?([A-Za-z]+(?:[ \t]+[A-Za-z]+)*)(?:\*\*)?"
    r"\s+\*?([0-9:]+)\*?\s*\n\n",
    re.MULTILINE,
)
_MD_LINK_RE = re.compile(r"\[([^\]]+)\]\([^)]+\)")
_MD_EMPHASIS_RE = re.compile(r"\*\*|\*")
_PUNCTUATION_RE = re.compile(r"[,.;:!?()\[\]{}]")
_WHITESPACE_RE = re.compile(r"\s+")


@dataclass
class Segment:
    """A segment of a transcript with speaker, timestamp, and text."""

    speaker: str
    timestamp: str
    text: str
    index: int  # Position in the original list
    # (start, end) character offsets of the timestamp inside the transcript
    # the segment was parsed from; None when the segment was built by hand.
    ts_span: Optional[Tuple[int, int]] = None


def extract_segments(transcript: str) -> List[Segment]:
    """
    Extract segments from a transcript.

    Works with both header formats, each followed by a blank line:
    - Speaker LastName 00:00:00
    - **Speaker LastName** *00:00:00*

    A segment's text is everything between its header and the next header
    (or the end of the transcript), so multi-paragraph segments stay whole.

    Args:
        transcript: Full transcript text.

    Returns:
        Segments in document order; an empty list when no header is found.
    """
    if not transcript:
        return []

    headers = list(_HEADER_RE.finditer(transcript))
    segments = []
    for i, header in enumerate(headers):
        # The body runs up to the next header, or to the end for the last one.
        body_end = headers[i + 1].start() if i + 1 < len(headers) else len(transcript)
        body = transcript[header.end():body_end].strip()
        segments.append(
            Segment(header.group(1), header.group(2), body, i, header.span(2))
        )
    return segments


def clean_text_for_matching(text: str) -> str:
    """Normalize text so formatting differences do not affect matching.

    Markdown links are reduced to their label, emphasis markers are dropped,
    punctuation becomes a space, whitespace runs collapse to one space, and
    the result is lower-cased and stripped.
    """
    text = _MD_LINK_RE.sub(r"\1", text)
    text = _MD_EMPHASIS_RE.sub("", text)
    text = _PUNCTUATION_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text)
    return text.lower().strip()


def find_best_matches(
    auto_segments: List[Segment],
    human_segments: List[Segment],
    threshold: float = DEFAULT_MATCH_THRESHOLD,
) -> Dict[int, int]:
    """
    Find the best matching segments between auto and human transcripts.

    Human segments are processed in order; each one takes the still-unused
    auto segment with the highest text similarity, provided that similarity
    is strictly greater than ``threshold``.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        human_segments: Segments parsed from the human-edited transcript.
        threshold: Similarity (0..1) a pair must exceed to be matched.

    Returns:
        Mapping of human segment index to auto segment index.
    """
    # One matcher per auto segment: SequenceMatcher caches its analysis of
    # the second sequence, so only the first sequence changes per comparison.
    matchers = [
        difflib.SequenceMatcher(None, "", clean_text_for_matching(seg.text))
        for seg in auto_segments
    ]

    matches: Dict[int, int] = {}
    used = set()  # auto indices already assigned to a human segment
    for h_idx, h_seg in enumerate(human_segments):
        h_text = clean_text_for_matching(h_seg.text)
        best_idx = None
        best_score = threshold
        for a_idx, matcher in enumerate(matchers):
            if a_idx in used:
                continue
            matcher.set_seq1(h_text)
            # Both quick ratios are upper bounds on ratio(), so a pair that
            # cannot beat the current best is skipped without the full diff.
            if (
                matcher.real_quick_ratio() <= best_score
                or matcher.quick_ratio() <= best_score
            ):
                continue
            score = matcher.ratio()
            if score > best_score:
                best_score = score
                best_idx = a_idx
        if best_idx is not None:
            matches[h_idx] = best_idx
            used.add(best_idx)
    return matches


def _splice_timestamps(
    transcript: str,
    human_segments: List[Segment],
    auto_segments: List[Segment],
    matches: Dict[int, int],
) -> str:
    """Return ``transcript`` with each matched header's timestamp replaced."""
    pieces = []
    cursor = 0
    # Segments are in document order, so ascending indices give ascending
    # offsets and every slice is taken from the untouched original text.
    for h_idx in sorted(matches):
        span = human_segments[h_idx].ts_span
        if span is None:
            continue
        start, end = span
        pieces.append(transcript[cursor:start])
        pieces.append(auto_segments[matches[h_idx]].timestamp)
        cursor = end
    pieces.append(transcript[cursor:])
    return "".join(pieces)


def _preview(text: str) -> str:
    """Return ``text`` truncated to PREVIEW_LENGTH characters plus an ellipsis."""
    if len(text) > PREVIEW_LENGTH:
        return text[:PREVIEW_LENGTH] + "..."
    return text


def _build_report(
    human_segments: List[Segment],
    auto_segments: List[Segment],
    matches: Dict[int, int],
) -> str:
    """Build the markdown report; ``human_segments`` must be non-empty."""
    match_count = len(matches)
    human_count = len(human_segments)
    auto_count = len(auto_segments)

    lines = [
        "### Timestamp Update Report\n\n",
        f"- Human segments: {human_count}\n",
        f"- Auto segments: {auto_count}\n",
        f"- Matched segments with updated timestamps: {match_count} "
        f"({match_count/human_count*100:.1f}%)\n",
    ]
    if match_count < human_count:
        lines.append(f"- Segments not updated: {human_count - match_count}\n")

    if matches:
        lines.append("\n### Example matches (for verification):\n\n")
        for h_idx, a_idx in list(matches.items())[:MAX_REPORT_EXAMPLES]:
            h_seg = human_segments[h_idx]
            a_seg = auto_segments[a_idx]
            lines.append(
                f"- {h_seg.speaker}: timestamp changed from "
                f"`{h_seg.timestamp}` to `{a_seg.timestamp}`\n"
            )
            # Two-space indent so these render as nested list items.
            lines.append(f"  - Human: \"{_preview(h_seg.text)}\"\n")
            lines.append(f"  - Auto: \"{_preview(a_seg.text)}\"\n\n")
    return "".join(lines)


def update_timestamps(human_transcript: str, auto_transcript: str) -> Tuple[str, str]:
    """
    Update timestamps in human transcript using timestamps from auto transcript.

    Only the timestamp characters of matched segment headers are replaced;
    every other character of the human transcript is left as it was.

    Args:
        human_transcript: Human-edited transcript whose timestamps are stale.
        auto_transcript: Auto-generated transcript with correct timestamps.

    Returns:
        ``(updated_transcript, report)``. When either transcript yields no
        segments, the first element is an error message and the report is
        an empty string.
    """
    human_segments = extract_segments(human_transcript)
    auto_segments = extract_segments(auto_transcript)

    if not human_segments or not auto_segments:
        return "Error: Could not parse transcripts. Check formatting.", ""

    matches = find_best_matches(auto_segments, human_segments)
    updated_transcript = _splice_timestamps(
        human_transcript, human_segments, auto_segments, matches
    )
    report = _build_report(human_segments, auto_segments, matches)
    return updated_transcript, report


# Create Gradio interface
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    gr.Markdown("""
    # 🎙️ Transcript Timestamp Updater

    This tool updates timestamps in a human-edited transcript by taking correct timestamps from an auto-generated transcript.

    ## Instructions:
    1. Paste your auto-generated transcript (with correct timestamps)
    2. Paste your human-edited transcript (with old timestamps that need updating)
    3. Click "Update Timestamps"

    The tool will preserve all human edits and only update the timestamps.
    """)

    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="Auto-Generated Transcript (with correct timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15
            )

        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (timestamps need updating)",
                placeholder="Paste your human-edited transcript here...",
                lines=15
            )

    update_btn = gr.Button("Update Timestamps")

    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )

        with gr.TabItem("Report"):
            report = gr.Markdown(
                label="Matching Report",
                value="Report will appear here..."
            )

    update_btn.click(
        fn=update_timestamps,
        inputs=[human_transcript, auto_transcript],
        outputs=[updated_transcript, report]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()