Spaces:
Running
Running
import gradio as gr | |
import re | |
import difflib | |
from typing import List, Dict, Tuple, Optional | |
import numpy as np | |
from dataclasses import dataclass | |
@dataclass
class Segment:
    """Represents a transcript segment.

    Declared as a dataclass so that the positional and keyword constructor
    calls used by the parsers and by ``update_timestamps`` work; a plain
    class with bare annotations has no matching ``__init__``.
    """
    # Speaker label captured from the segment header.
    speaker: str
    # Timestamp string captured from the segment header (e.g. "00:00:00").
    timestamp: str
    # Segment body as it appeared in the transcript, stripped of outer whitespace.
    text: str
    # For matching purposes - original text without formatting
    raw_text: str
@dataclass
class Match:
    """Represents a match between segments.

    Declared as a dataclass so that ``Match(auto_index, human_index,
    similarity)`` can be constructed positionally; a plain class with bare
    annotations has no matching ``__init__``.
    """
    # Position of the segment in the auto-generated segment list.
    auto_index: int
    # Position of the segment in the human-edited segment list.
    human_index: int
    # Similarity score of the pair, as returned by the matcher.
    similarity: float
def parse_auto_transcript(transcript: str) -> List[Segment]:
    """Split an auto-generated transcript into segments.

    A segment starts with a "Speaker X 00:00:00" header (optionally wrapped
    in markdown bold/italic markers), followed by a blank line and the body.
    The body runs until the next "Speaker " header or the end of the input.

    Args:
        transcript: Full text of the auto-generated transcript.

    Returns:
        One ``Segment`` per header found, in order of appearance. ``text`` is
        the stripped body; ``raw_text`` is the same body with ``*`` and ``**``
        markers removed.
    """
    segment_re = re.compile(
        r"(?:\*\*)?Speaker (\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?Speaker |\Z)",
        re.DOTALL,
    )

    parsed = []
    for hit in segment_re.finditer(transcript):
        who, when, body = hit.group(1), hit.group(2), hit.group(3).strip()
        # Strip markdown emphasis markers so matching compares plain words.
        plain_body = re.sub(r'\*\*|\*', '', body)
        parsed.append(Segment(who, when, body, plain_body))
    return parsed
def parse_human_transcript(transcript: str) -> List[Segment]:
    """Parse the human-edited transcript into segments.

    Handles both "**Speaker X** *00:00:00*" and "Speaker X 00:00:00" style
    headers; the "Speaker " prefix and the markdown markers are optional.
    A header must be followed by a blank line and then the segment text.

    A segment's text runs until the next complete header or the end of the
    input, so a turn that spans several paragraphs is kept whole. (Stopping
    at the first blank line would drop every paragraph after the first.)

    Args:
        transcript: Full text of the human-edited transcript.

    Returns:
        One ``Segment`` per header found, in order of appearance. ``text`` is
        the stripped body; ``raw_text`` is the body with ``*``/``**`` markers
        and markdown links removed, for similarity matching.
    """
    # Header with capture groups: optional bold markers, optional "Speaker "
    # prefix, a single-word speaker label, then an optionally italicised
    # 00:00:00-style timestamp.
    header = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?"
    # The same header without capture groups, used only to recognise where
    # the next segment begins.
    next_header = r"(?:\*\*)?(?:Speaker )?\w+(?:\*\*)? (?:\*)?\d{2}:\d{2}:\d{2}(?:\*)?"
    # The lookahead must see a *complete* following header; with every part
    # optional it would succeed on any blank line and truncate the body.
    pattern = header + r"\s*\n\n(.*?)(?=\n\n" + next_header + r"\s*\n\n|\Z)"

    segments = []
    for match in re.finditer(pattern, transcript, re.DOTALL):
        speaker, timestamp, text = match.groups()
        text = text.strip()
        # Remove any markdown formatting for matching purposes
        raw_text = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text)
        segments.append(Segment(speaker, timestamp, text, raw_text))
    return segments
def similarity_score(text1: str, text2: str) -> float:
    """Return how alike two pieces of text are, ignoring case and punctuation.

    Both inputs are lower-cased and every character that is neither a word
    character nor whitespace is removed before comparison.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The ``difflib.SequenceMatcher`` ratio of the two normalised strings.
    """
    def _normalise(raw: str) -> str:
        # Lower-case first, then drop punctuation / markdown symbols.
        return re.sub(r'[^\w\s]', '', raw.lower())

    matcher = difflib.SequenceMatcher(None, _normalise(text1), _normalise(text2))
    return matcher.ratio()
def _best_unused_match(
    auto_segment: Segment,
    human_segments: List[Segment],
    used_human_indices: set,
    threshold: float,
) -> Tuple[int, float]:
    """Return (index, similarity) of the best still-unused human segment.

    Only candidates scoring at least ``threshold`` qualify; on ties the
    earliest index wins. Returns ``(-1, 0.0)`` when nothing qualifies.
    """
    best_idx = -1
    best_similarity = 0.0
    for human_idx, human_segment in enumerate(human_segments):
        if human_idx in used_human_indices:
            continue
        similarity = similarity_score(auto_segment.raw_text, human_segment.raw_text)
        if similarity > best_similarity and similarity >= threshold:
            best_similarity = similarity
            best_idx = human_idx
    return best_idx, best_similarity


def find_best_matches(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Match]:
    """Find the best matching segments between auto and human transcripts.

    Matching is greedy and runs in two passes over the auto segments, in
    order: first with a similarity threshold of 0.6 (obvious matches), then
    with 0.4 for auto segments still unmatched. Each human segment is used
    at most once.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        human_segments: Segments parsed from the human-edited transcript.

    Returns:
        Matches in the order they were found: all first-pass matches,
        followed by all second-pass matches.
    """
    matches = []
    used_human_indices = set()
    # Tracked as a set so the second pass does not rescan ``matches`` for
    # every auto segment.
    matched_auto_indices = set()

    # High threshold first so confident pairs claim their human segment
    # before weaker candidates are considered.
    for threshold in (0.6, 0.4):
        for auto_idx, auto_segment in enumerate(auto_segments):
            if auto_idx in matched_auto_indices:
                continue
            best_match_idx, best_similarity = _best_unused_match(
                auto_segment, human_segments, used_human_indices, threshold
            )
            if best_match_idx >= 0:
                matches.append(Match(auto_idx, best_match_idx, best_similarity))
                used_human_indices.add(best_match_idx)
                matched_auto_indices.add(auto_idx)
    return matches
def update_timestamps(auto_segments: List[Segment], human_segments: List[Segment], matches: List[Match]) -> str:
    """Rebuild the human transcript using timestamps from matched auto segments.

    For every match, the human segment keeps its speaker and text but takes
    the timestamp of the matched auto segment. Unmatched human segments are
    emitted unchanged. The input lists are not modified.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        human_segments: Segments parsed from the human-edited transcript.
        matches: Pairings of auto and human segment indices.

    Returns:
        The re-rendered transcript, segments separated by blank lines.
    """
    # Work on a shallow copy so the caller's list is left untouched.
    retimed = list(human_segments)
    for pairing in matches:
        source = auto_segments[pairing.auto_index]
        target = human_segments[pairing.human_index]
        retimed[pairing.human_index] = Segment(
            speaker=target.speaker,
            timestamp=source.timestamp,
            text=target.text,
            raw_text=target.raw_text,
        )

    if not retimed:
        return ""

    # Output format is decided once, from the first human segment.
    # NOTE(review): this only inspects the segment body and the captured
    # timestamp, not the original header line - confirm that is the
    # intended markdown detection.
    first = human_segments[0]
    as_markdown = "**" in first.text or "*" in first.timestamp

    if as_markdown:
        rendered = [f"**{seg.speaker}** *{seg.timestamp}*\n\n{seg.text}" for seg in retimed]
    else:
        rendered = [f"Speaker {seg.speaker} {seg.timestamp}\n\n{seg.text}" for seg in retimed]
    return "\n\n".join(rendered)
def find_unmatched_segments(auto_segments: List[Segment], matches: List[Match]) -> List[int]:
    """List the positions of auto segments that no match refers to.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        matches: Pairings produced by the matcher.

    Returns:
        Indices into ``auto_segments``, in ascending order, whose index does
        not appear as ``auto_index`` in any match.
    """
    paired = set()
    for pairing in matches:
        paired.add(pairing.auto_index)

    leftover = []
    for position, _ in enumerate(auto_segments):
        if position not in paired:
            leftover.append(position)
    return leftover
def format_unmatched_segments(auto_segments: List[Segment], unmatched_indices: List[int], is_markdown: bool) -> str:
    """Render the unmatched auto segments as a display string.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        unmatched_indices: Indices into ``auto_segments`` to render.
        is_markdown: When true, headers are written with bold speaker and
            italic timestamp; otherwise as plain text.

    Returns:
        A fixed "nothing found" message when ``unmatched_indices`` is empty,
        otherwise a heading followed by the segments separated by blank lines.
    """
    if not unmatched_indices:
        return "No unmatched segments found"

    blocks = []
    for seg in (auto_segments[i] for i in unmatched_indices):
        blocks.append(
            f"**Speaker {seg.speaker}** *{seg.timestamp}*\n\n{seg.text}"
            if is_markdown
            else f"Speaker {seg.speaker} {seg.timestamp}\n\n{seg.text}"
        )
    return "### Unmatched Segments (New Content)\n\n" + "\n\n".join(blocks)
def process_transcripts(auto_transcript: str, human_transcript: str):
    """Run the full pipeline: parse, match, re-time, and summarise.

    Args:
        auto_transcript: Text of the new auto-generated transcript.
        human_transcript: Text of the human-edited transcript.

    Returns:
        A 3-tuple of strings: the updated human transcript, the formatted
        unmatched auto segments, and a markdown statistics report. If either
        transcript yields no segments, the first element is an error message
        and the other two are empty strings.
    """
    auto_segments = parse_auto_transcript(auto_transcript)
    human_segments = parse_human_transcript(human_transcript)

    # Bail out early when either side could not be parsed.
    if not (auto_segments and human_segments):
        return "Error: Could not parse one or both transcripts. Please check the format.", "", ""

    matches = find_best_matches(auto_segments, human_segments)
    unmatched_indices = find_unmatched_segments(auto_segments, matches)

    # Any asterisk in the human transcript is taken to mean markdown output.
    is_markdown = any(marker in human_transcript for marker in ("**", "*"))

    updated_transcript = update_timestamps(auto_segments, human_segments, matches)
    unmatched_segments = format_unmatched_segments(auto_segments, unmatched_indices, is_markdown)

    # Assemble the statistics report piece by piece, then join once.
    report = [
        "### Matching Statistics\n\n",
        f"- Auto-generated segments: {len(auto_segments)}\n",
        f"- Human-edited segments: {len(human_segments)}\n",
        f"- Matched segments: {len(matches)}\n",
        f"- Unmatched segments: {len(unmatched_indices)}\n",
    ]

    if matches:
        similarities = [pairing.similarity for pairing in matches]
        report.append(f"- Average match similarity: {sum(similarities)/len(similarities):.2f}\n")

        # Bucket match similarities into 0.1-wide bins from 0.4 to 1.0.
        bins = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        hist, _ = np.histogram(similarities, bins=bins)
        report.append("\n#### Match Quality Distribution\n\n")
        for lower, upper, count in zip(bins[:-1], bins[1:], hist):
            report.append(f"- {lower:.1f}-{upper:.1f}: {count} matches\n")

    stats = "".join(report)
    return updated_transcript, unmatched_segments, stats
# Create Gradio interface
# The UI is built declaratively inside a gr.Blocks context bound to `demo`:
# an intro, two input text boxes, a trigger button, and three output tabs.
# NOTE(review): the nesting below follows the usual Blocks layout (button and
# tabs as siblings of the input row) - confirm against the deployed app.
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    # Title and usage instructions.
    gr.Markdown("""
# Transcript Timestamp Updater
This tool updates timestamps in a human-edited transcript based on a new auto-generated transcript.
## Instructions:
1. Paste your new auto-generated transcript (with updated timestamps)
2. Paste your human-edited transcript (with old timestamps)
3. Click "Update Timestamps" to generate a new version of the human-edited transcript with updated timestamps
The tool will try to match segments between the two transcripts and update the timestamps accordingly.
""")
    # Inputs: one row holding a column for each transcript.
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="New Auto-Generated Transcript (with updated timestamps)",
                placeholder="Paste the new auto-generated transcript here...",
                lines=15
            )
        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (with old timestamps)",
                placeholder="Paste your human-edited transcript here...",
                lines=15
            )
    update_btn = gr.Button("Update Timestamps")
    # Outputs: one tab per value returned by process_transcripts.
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Human Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )
        with gr.TabItem("Unmatched Segments"):
            unmatched_segments = gr.Markdown(
                label="Unmatched Segments",
                value="Unmatched segments will appear here..."
            )
        with gr.TabItem("Statistics"):
            stats = gr.Markdown(
                label="Matching Statistics",
                value="Statistics will appear here..."
            )
    # Wire the button: the two text boxes feed process_transcripts, whose
    # three return values fill the three output components in order.
    update_btn.click(
        fn=process_transcripts,
        inputs=[auto_transcript, human_transcript],
        outputs=[updated_transcript, unmatched_segments, stats]
    )
# Launch the app
# Only start the server when this file is run as a script.
if __name__ == "__main__":
    demo.launch()