"""Transcript Timestamp Updater.

Copies timestamps from a freshly auto-generated transcript onto a
human-edited transcript by fuzzy-matching the text of their segments, and
exposes the workflow through a small Gradio UI.
"""

import difflib
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import numpy as np

try:
    import gradio as gr
except ImportError:
    # The parsing/matching functions are pure Python; only the UI needs
    # gradio.  Keeping the import optional lets them be imported and tested
    # on their own.  Launching without gradio fails loudly in __main__.
    gr = None


_TIMESTAMP = r"\d{2}:\d{2}:\d{2}"


def _build_segment_regex(prefix: str) -> "re.Pattern":
    """Compile the regex that matches one transcript segment.

    A segment is a header line (optionally bold speaker, optionally italic
    timestamp), a blank line, and then text running up to the next header
    or the end of the input.

    The lookahead deliberately repeats the *complete* header.  Stopping at
    any blank line (or at any paragraph that merely starts with "Speaker ")
    would truncate multi-paragraph segments and silently drop the rest.

    Args:
        prefix: Regex fragment (without capture groups) for the literal
            lead-in that precedes the speaker id, e.g. ``"Speaker "``.

    Returns:
        A compiled pattern with groups: 1 = lead-in as written,
        2 = speaker id, 3 = timestamp, 4 = segment text.
    """
    header = (
        r"(?:\*\*)?(" + prefix + r")(\w+)(?:\*\*)? \*?(" + _TIMESTAMP + r")\*?\s*\n\n"
    )
    next_header = (
        r"(?:\*\*)?" + prefix + r"\w+(?:\*\*)? \*?" + _TIMESTAMP + r"\*?\s*\n\n"
    )
    return re.compile(header + r"(.*?)(?=\n\n" + next_header + r"|\Z)", re.DOTALL)


# Auto transcripts always label speakers "Speaker X"; human transcripts may
# drop the "Speaker " lead-in (e.g. "**John** *00:00:00*").
_AUTO_SEGMENT_RE = _build_segment_regex(r"Speaker ")
_HUMAN_SEGMENT_RE = _build_segment_regex(r"(?:Speaker )?")

_AUTO_MARKUP_RE = re.compile(r"\*\*|\*")
_HUMAN_MARKUP_RE = re.compile(r"\*\*|\*|\[.*?\]\(.*?\)")
_NON_WORD_RE = re.compile(r"[^\w\s]")

# A header counts as markdown when the speaker label is bold or the
# timestamp is italic.
_MARKDOWN_HEADER_RE = re.compile(
    r"\*\*(?:Speaker )?\w+\*\* \*?" + _TIMESTAMP + r"|\w+ \*" + _TIMESTAMP + r"\*"
)

_INTRO_MARKDOWN = """
# Transcript Timestamp Updater

This tool updates timestamps in a human-edited transcript based on a new auto-generated transcript.

## Instructions:
1. Paste your new auto-generated transcript (with updated timestamps)
2. Paste your human-edited transcript (with old timestamps)
3. Click "Update Timestamps" to generate a new version of the human-edited transcript with updated timestamps

The tool will try to match segments between the two transcripts and update the timestamps accordingly.
"""


@dataclass
class Segment:
    """One speaker turn of a transcript."""

    # Speaker id without the "Speaker " lead-in (e.g. "1" or "John").
    speaker: str
    # Timestamp exactly as written in the header, "HH:MM:SS".
    timestamp: str
    # Segment body, stripped of surrounding whitespace, markup kept.
    text: str
    # Body with markdown markup removed; used only for similarity matching.
    raw_text: str
    # Lead-in that preceded the speaker id in the source header ("Speaker "
    # or ""), kept so the header can be reproduced faithfully.
    speaker_prefix: str = "Speaker "


@dataclass
class Match:
    """A pairing of an auto segment with a human segment."""

    auto_index: int
    human_index: int
    similarity: float


def parse_auto_transcript(transcript: str) -> List[Segment]:
    """Parse the auto-generated transcript into segments.

    Recognises headers of the form ``Speaker X 00:00:00`` (optionally with
    markdown bold/italic markers) followed by a blank line and the text.

    Args:
        transcript: Full text of the auto-generated transcript.

    Returns:
        Segments in document order; empty if no header is found.
    """
    segments = []
    for found in _AUTO_SEGMENT_RE.finditer(transcript):
        prefix, speaker, timestamp, body = found.groups()
        body = body.strip()
        raw_text = _AUTO_MARKUP_RE.sub("", body)
        segments.append(Segment(speaker, timestamp, body, raw_text, prefix))
    return segments


def parse_human_transcript(transcript: str) -> List[Segment]:
    """Parse the human-edited transcript into segments.

    Accepts both ``**Speaker X** *00:00:00*`` and ``Speaker X 00:00:00``
    headers, with the ``Speaker `` lead-in optional.  A segment's text runs
    until the next complete header, so multi-paragraph segments stay intact.

    Args:
        transcript: Full text of the human-edited transcript.

    Returns:
        Segments in document order; empty if no header is found.
    """
    segments = []
    for found in _HUMAN_SEGMENT_RE.finditer(transcript):
        prefix, speaker, timestamp, body = found.groups()
        body = body.strip()
        # Emphasis markers and inline links are dropped for matching only.
        raw_text = _HUMAN_MARKUP_RE.sub("", body)
        segments.append(Segment(speaker, timestamp, body, raw_text, prefix))
    return segments


def similarity_score(text1: str, text2: str) -> float:
    """Return a 0..1 similarity ratio between two pieces of text.

    Both inputs are lower-cased and stripped of punctuation before being
    compared with ``difflib.SequenceMatcher``.
    """
    clean1 = _NON_WORD_RE.sub("", text1.lower())
    clean2 = _NON_WORD_RE.sub("", text2.lower())
    return difflib.SequenceMatcher(None, clean1, clean2).ratio()


def find_best_matches(
    auto_segments: List[Segment],
    human_segments: List[Segment],
    high_threshold: float = 0.6,
    low_threshold: float = 0.4,
) -> List[Match]:
    """Pair auto segments with human segments by text similarity.

    Two greedy passes run over the auto segments in order: the first only
    accepts pairs scoring at least ``high_threshold``; the second retries
    the still-unmatched auto segments against the still-unused human
    segments with ``low_threshold``.  Each human segment is used at most
    once, and on ties the earliest human segment wins.

    Args:
        auto_segments: Segments from the auto-generated transcript.
        human_segments: Segments from the human-edited transcript.
        high_threshold: Minimum similarity for a first-pass match.
        low_threshold: Minimum similarity for a second-pass match.

    Returns:
        Matches in the order they were made (first pass, then second pass).
    """
    matches: List[Match] = []
    matched_auto = set()
    used_human = set()
    # Every pair examined in the second pass was already scored in the
    # first, so caching avoids re-running SequenceMatcher on it.
    scores: Dict[Tuple[int, int], float] = {}

    def score(auto_idx: int, human_idx: int) -> float:
        key = (auto_idx, human_idx)
        if key not in scores:
            scores[key] = similarity_score(
                auto_segments[auto_idx].raw_text,
                human_segments[human_idx].raw_text,
            )
        return scores[key]

    for threshold in (high_threshold, low_threshold):
        for auto_idx in range(len(auto_segments)):
            if auto_idx in matched_auto:
                continue

            best_idx = -1
            best_similarity = 0.0
            for human_idx in range(len(human_segments)):
                if human_idx in used_human:
                    continue
                similarity = score(auto_idx, human_idx)
                if similarity > best_similarity and similarity >= threshold:
                    best_similarity = similarity
                    best_idx = human_idx

            if best_idx >= 0:
                matches.append(Match(auto_idx, best_idx, best_similarity))
                matched_auto.add(auto_idx)
                used_human.add(best_idx)

    return matches


def update_timestamps(
    auto_segments: List[Segment],
    human_segments: List[Segment],
    matches: List[Match],
    is_markdown: Optional[bool] = None,
) -> str:
    """Render the human transcript with timestamps taken from its matches.

    Human segments without a match keep their original timestamp.  The
    speaker label is reproduced as parsed (including or omitting the
    ``Speaker `` lead-in).

    Args:
        auto_segments: Segments from the auto-generated transcript.
        human_segments: Segments from the human-edited transcript.
        matches: Pairings produced by ``find_best_matches``.
        is_markdown: Emit ``**label** *timestamp*`` headers when true and
            plain ``label timestamp`` headers when false.  When omitted, a
            fallback heuristic is used: markdown if the first human
            segment's text contains ``**``.

    Returns:
        The re-timed transcript, segments separated by blank lines.
    """
    timestamps = [segment.timestamp for segment in human_segments]
    for match in matches:
        timestamps[match.human_index] = auto_segments[match.auto_index].timestamp

    if is_markdown is None:
        # Segments do not record header styling, so without an explicit
        # flag the best available signal is markup in the first body.
        is_markdown = bool(human_segments) and (
            "**" in human_segments[0].text or "*" in human_segments[0].timestamp
        )

    rendered = []
    for segment, timestamp in zip(human_segments, timestamps):
        label = f"{segment.speaker_prefix}{segment.speaker}"
        if is_markdown:
            rendered.append(f"**{label}** *{timestamp}*\n\n{segment.text}")
        else:
            rendered.append(f"{label} {timestamp}\n\n{segment.text}")

    return "\n\n".join(rendered)


def find_unmatched_segments(auto_segments: List[Segment], matches: List[Match]) -> List[int]:
    """Return indices of auto segments that appear in no match, ascending."""
    matched_auto_indices = {match.auto_index for match in matches}
    return [i for i in range(len(auto_segments)) if i not in matched_auto_indices]


def format_unmatched_segments(
    auto_segments: List[Segment],
    unmatched_indices: List[int],
    is_markdown: bool,
) -> str:
    """Format unmatched auto segments for display.

    Args:
        auto_segments: Segments from the auto-generated transcript.
        unmatched_indices: Indices into ``auto_segments`` to show.
        is_markdown: Use bold/italic headers when true, plain otherwise.

    Returns:
        A markdown section listing the segments, or a fixed notice when
        ``unmatched_indices`` is empty.
    """
    if not unmatched_indices:
        return "No unmatched segments found"

    rendered = []
    for idx in unmatched_indices:
        segment = auto_segments[idx]
        label = f"{segment.speaker_prefix}{segment.speaker}"
        if is_markdown:
            rendered.append(f"**{label}** *{segment.timestamp}*\n\n{segment.text}")
        else:
            rendered.append(f"{label} {segment.timestamp}\n\n{segment.text}")

    return "### Unmatched Segments (New Content)\n\n" + "\n\n".join(rendered)


def _uses_markdown_headers(transcript: str) -> bool:
    """Tell whether any segment header in *transcript* uses markdown styling."""
    return _MARKDOWN_HEADER_RE.search(transcript) is not None


def process_transcripts(auto_transcript: str, human_transcript: str):
    """Re-time the human transcript and summarise the matching.

    Args:
        auto_transcript: New auto-generated transcript (fresh timestamps).
        human_transcript: Human-edited transcript (stale timestamps).

    Returns:
        A 3-tuple of strings: the updated human transcript, the formatted
        unmatched auto segments, and markdown statistics.  If either
        transcript yields no segments, the first element is an error
        message and the other two are empty.
    """
    auto_segments = parse_auto_transcript(auto_transcript)
    human_segments = parse_human_transcript(human_transcript)

    if not auto_segments or not human_segments:
        return "Error: Could not parse one or both transcripts. Please check the format.", "", ""

    matches = find_best_matches(auto_segments, human_segments)
    unmatched_indices = find_unmatched_segments(auto_segments, matches)

    # Decide the output style from the headers themselves: a stray asterisk
    # in the body must not switch plain headers to markdown, and a markdown
    # transcript whose first paragraph has no bold text must stay markdown.
    is_markdown = _uses_markdown_headers(human_transcript)

    updated_transcript = update_timestamps(
        auto_segments, human_segments, matches, is_markdown
    )
    unmatched_segments = format_unmatched_segments(
        auto_segments, unmatched_indices, is_markdown
    )

    parts = [
        "### Matching Statistics\n\n",
        f"- Auto-generated segments: {len(auto_segments)}\n",
        f"- Human-edited segments: {len(human_segments)}\n",
        f"- Matched segments: {len(matches)}\n",
        f"- Unmatched segments: {len(unmatched_indices)}\n",
    ]

    if matches:
        similarities = [match.similarity for match in matches]
        parts.append(
            f"- Average match similarity: {sum(similarities) / len(similarities):.2f}\n"
        )

        # Bin edges start at 0.4, the default lowest accepted similarity.
        bins = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        hist, _ = np.histogram(similarities, bins=bins)

        parts.append("\n#### Match Quality Distribution\n\n")
        for lower, upper, count in zip(bins, bins[1:], hist):
            parts.append(f"- {lower:.1f}-{upper:.1f}: {count} matches\n")

    stats = "".join(parts)
    return updated_transcript, unmatched_segments, stats


def build_demo():
    """Assemble the Gradio UI and wire the button to ``process_transcripts``.

    Requires gradio to be importable.
    """
    with gr.Blocks(title="Transcript Timestamp Updater") as blocks:
        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Row():
            with gr.Column():
                auto_box = gr.Textbox(
                    label="New Auto-Generated Transcript (with updated timestamps)",
                    placeholder="Paste the new auto-generated transcript here...",
                    lines=15,
                )
            with gr.Column():
                human_box = gr.Textbox(
                    label="Human-Edited Transcript (with old timestamps)",
                    placeholder="Paste your human-edited transcript here...",
                    lines=15,
                )

        update_btn = gr.Button("Update Timestamps")

        with gr.Tabs():
            with gr.TabItem("Updated Transcript"):
                updated_box = gr.TextArea(
                    label="Updated Human Transcript",
                    placeholder="The updated transcript will appear here...",
                    lines=20,
                )
            with gr.TabItem("Unmatched Segments"):
                unmatched_view = gr.Markdown(
                    label="Unmatched Segments",
                    value="Unmatched segments will appear here...",
                )
            with gr.TabItem("Statistics"):
                stats_view = gr.Markdown(
                    label="Matching Statistics",
                    value="Statistics will appear here...",
                )

        update_btn.click(
            fn=process_transcripts,
            inputs=[auto_box, human_box],
            outputs=[updated_box, unmatched_view, stats_view],
        )

    return blocks


# Create Gradio interface (None when gradio is not installed).
demo = build_demo() if gr is not None else None

# Launch the app
if __name__ == "__main__":
    if demo is None:
        raise SystemExit(
            "gradio is not installed; run `pip install gradio` to launch the UI."
        )
    demo.launch()