# Spaces: Sleeping
import gradio as gr | |
import re | |
import difflib | |
from typing import List, Dict, Tuple, Optional | |
from dataclasses import dataclass | |
@dataclass
class Segment:
    """Represents a transcript segment.

    Instances are created positionally as
    ``Segment(speaker, timestamp, text, raw_text)`` and by keyword, so the
    field order below is part of the interface.
    """

    speaker: str  # speaker label captured from the segment header
    timestamp: str  # timestamp string captured from the segment header
    text: str  # segment body as written (may contain markdown)
    raw_text: str  # For matching purposes - original text without formatting
def parse_transcript(transcript: str) -> List[Segment]:
    """Parse a transcript into segments, handling both markdown and plain formats.

    A segment is a header line followed by a blank line and a body. Accepted
    header forms are ``**Speaker X** *00:00:00*`` and ``Speaker X 00:00:00``;
    the ``Speaker `` prefix and the markdown markers are optional and are not
    stored. The body runs until the next header (or the end of the input), so
    it may span several paragraphs.

    Args:
        transcript: Full transcript text.

    Returns:
        Segments in the order they appear; an empty list when no header is found.
    """
    # Header with capture groups for the speaker label and the timestamp.
    header = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? \*?(\d{2}:\d{2}:\d{2})\*?"
    # Same header without groups, used only to recognise where the next segment starts.
    next_header = r"(?:\*\*)?(?:Speaker )?\w+(?:\*\*)? \*?\d{2}:\d{2}:\d{2}\*?"
    # The body must stop at a blank line that is followed by a real header.
    # Stopping at any blank line would silently drop every paragraph after the first.
    pattern = re.compile(
        header + r"\s*\n\n(.*?)(?=\n\n" + next_header + r"\s*\n\n|\Z)",
        re.DOTALL,
    )
    segments = []
    for match in pattern.finditer(transcript):
        speaker, timestamp, text = match.groups()
        body = text.strip()
        # Remove markdown formatting for matching purposes, keeping the visible
        # text of links ("[text](url)" -> "text") rather than deleting it.
        raw_text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", body)
        raw_text = re.sub(r"\*+", "", raw_text)
        segments.append(Segment(speaker, timestamp, body, raw_text))
    return segments
def clean_text_for_comparison(text: str) -> str:
    """Normalise text so that two versions of the same passage compare as similar.

    Markdown links are reduced to their visible text, the text is lowercased,
    every character that is neither a word character nor whitespace is
    removed (this also removes ``*`` emphasis markers), and surrounding
    whitespace is stripped.

    Args:
        text: Text to normalise; may contain markdown.

    Returns:
        The normalised text.
    """
    # Keep the words a reader sees in "[words](url)"; only the URL part is noise.
    text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", text)
    text = re.sub(r"[^\w\s]", "", text.lower())
    return text.strip()
def match_segments(
    auto_segments: List[Segment],
    human_segments: List[Segment],
    threshold: float = 0.6,
) -> List[Tuple[int, int]]:
    """Match segments between auto and human transcripts using text similarity.

    Human segments are processed in order. Each one claims the not-yet-claimed
    auto segment with the highest ``difflib`` similarity ratio, provided that
    ratio reaches ``threshold``. An auto segment is matched at most once.

    Args:
        auto_segments: Segments of the auto-generated transcript.
        human_segments: Segments of the human-edited transcript.
        threshold: Minimum similarity ratio (0..1) for a pair to count as a match.

    Returns:
        List of tuples ``(auto_index, human_index)``, in human-segment order.
    """
    matches = []
    # Prepare clean versions of texts for comparison
    auto_texts = [clean_text_for_comparison(seg.raw_text) for seg in auto_segments]
    human_texts = [clean_text_for_comparison(seg.raw_text) for seg in human_segments]
    # Set of claimed auto indices gives an O(1) "already matched" check.
    claimed = set()
    matcher = difflib.SequenceMatcher(None)
    for human_idx, human_text in enumerate(human_texts):
        # SequenceMatcher caches its analysis of the second sequence, so the
        # human text is set once here and only the first sequence varies below.
        matcher.set_seq2(human_text)
        best_match_idx = -1
        best_similarity = 0
        for auto_idx, auto_text in enumerate(auto_texts):
            if auto_idx in claimed:
                continue
            matcher.set_seq1(auto_text)
            similarity = matcher.ratio()
            if similarity > best_similarity and similarity >= threshold:
                best_similarity = similarity
                best_match_idx = auto_idx
        if best_match_idx >= 0:
            claimed.add(best_match_idx)
            matches.append((best_match_idx, human_idx))
    return matches
def update_timestamps(
    auto_segments: List[Segment],
    human_segments: List[Segment],
    matches: List[Tuple[int, int]],
    is_markdown: Optional[bool] = None,
) -> str:
    """Update timestamps in the human transcript based on matches.

    Every matched human segment keeps its speaker and text and takes the
    timestamp of its matched auto segment; unmatched human segments are
    emitted unchanged. The input lists are not modified.

    Args:
        auto_segments: Segments of the auto-generated transcript.
        human_segments: Segments of the human-edited transcript.
        matches: ``(auto_index, human_index)`` pairs.
        is_markdown: Output header style. ``True`` writes
            ``**Speaker X** *HH:MM:SS*``, ``False`` writes
            ``Speaker X HH:MM:SS``. When ``None`` the style is guessed from
            the first human segment (see the note in the body).

    Returns:
        The rebuilt transcript, segments separated by blank lines; an empty
        string when ``human_segments`` is empty.
    """
    updated_segments = list(human_segments)
    for auto_idx, human_idx in matches:
        edited = human_segments[human_idx]
        # Keep the human-edited text, update only the timestamp
        updated_segments[human_idx] = Segment(
            speaker=edited.speaker,
            timestamp=auto_segments[auto_idx].timestamp,
            text=edited.text,
            raw_text=edited.raw_text,
        )
    if is_markdown is None:
        # NOTE(review): this fallback only sees the first segment's body and
        # timestamp, not its original header line, so a markdown transcript
        # whose first body has no bold text is detected as plain. Callers that
        # know the format should pass is_markdown explicitly.
        is_markdown = bool(human_segments) and (
            "**" in human_segments[0].text or "*" in human_segments[0].timestamp
        )
    result = []
    for segment in updated_segments:
        # Both styles carry the "Speaker " prefix so the label survives a
        # parse/format round trip in either format.
        if is_markdown:
            heading = f"**Speaker {segment.speaker}** *{segment.timestamp}*"
        else:
            heading = f"Speaker {segment.speaker} {segment.timestamp}"
        result.append(f"{heading}\n\n{segment.text}")
    return "\n\n".join(result)
def get_unmatched_auto_segments(auto_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
    """Return, in ascending order, the auto-segment indices absent from ``matches``.

    Only the first element of each match pair (the auto index) is consulted.
    """
    leftover = set(range(len(auto_segments)))
    leftover.difference_update(pair[0] for pair in matches)
    return sorted(leftover)
def get_unmatched_human_segments(human_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
    """Return, in ascending order, the human-segment indices absent from ``matches``.

    Only the second element of each match pair (the human index) is consulted.
    """
    paired = set()
    for pair in matches:
        paired.add(pair[1])
    unmatched = []
    for position in range(len(human_segments)):
        if position in paired:
            continue
        unmatched.append(position)
    return unmatched
def format_segments(segments: List[Segment], indices: List[int], is_markdown: bool) -> str:
    """Format the selected segments for display.

    Args:
        segments: All segments of one transcript.
        indices: Positions in ``segments`` to render, in output order.
        is_markdown: ``True`` writes ``**Speaker X** *HH:MM:SS*`` headers,
            ``False`` writes ``Speaker X HH:MM:SS`` headers.

    Returns:
        The rendered segments separated by blank lines, or the string
        ``"None"`` when ``indices`` is empty.
    """
    if not indices:
        return "None"
    result = []
    for idx in indices:
        segment = segments[idx]
        # Both styles carry the "Speaker " prefix; without it the markdown
        # form would show only the bare label (e.g. "**1**").
        if is_markdown:
            heading = f"**Speaker {segment.speaker}** *{segment.timestamp}*"
        else:
            heading = f"Speaker {segment.speaker} {segment.timestamp}"
        result.append(f"{heading}\n\n{segment.text}")
    return "\n\n".join(result)
def process_transcripts(auto_transcript: str, human_transcript: str):
    """Process transcripts and update timestamps.

    Parses both transcripts, matches their segments, rebuilds the
    human-edited transcript with the matched timestamps, and builds a
    markdown report of the matching.

    Args:
        auto_transcript: Auto-generated transcript text.
        human_transcript: Human-edited transcript text.

    Returns:
        A 2-tuple ``(updated_transcript, stats_markdown)``. When either
        transcript yields no segments, the first element is an error message
        and the second is an empty string.
    """
    auto_segments = parse_transcript(auto_transcript)
    human_segments = parse_transcript(human_transcript)
    if not auto_segments or not human_segments:
        # Always return exactly two values: the click handler that calls this
        # function is wired to two output components.
        return "Error: Could not parse transcripts. Check formatting.", ""
    matches = match_segments(auto_segments, human_segments)
    unmatched_auto = get_unmatched_auto_segments(auto_segments, matches)
    unmatched_human = get_unmatched_human_segments(human_segments, matches)
    # Any asterisk in the human transcript is taken as a sign of markdown.
    # NOTE(review): update_timestamps decides its own output style from the
    # first human segment, which can disagree with this flag.
    is_markdown = "*" in human_transcript
    updated_transcript = update_timestamps(auto_segments, human_segments, matches)
    summary = "\n".join([
        "### Matching Statistics\n",
        f"- Auto-generated segments: {len(auto_segments)}",
        f"- Human-edited segments: {len(human_segments)}",
        f"- Matched segments: {len(matches)}",
        f"- Unmatched auto segments (new content): {len(unmatched_auto)}",
        f"- Unmatched human segments (removed content): {len(unmatched_human)}",
    ])
    sections = [summary]
    if unmatched_auto:
        sections.append(
            "### New Content (In Auto-generated but not in Human-edited)\n\n"
            + format_segments(auto_segments, unmatched_auto, is_markdown)
        )
    if unmatched_human:
        sections.append(
            "### Removed Content (In Human-edited but not in Auto-generated)\n\n"
            + format_segments(human_segments, unmatched_human, is_markdown)
        )
    # Blank line between sections so each heading starts its own markdown block.
    stats = "\n\n".join(sections)
    return updated_transcript, stats
# Create Gradio interface
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    # NOTE(review): the emoji in the heading was mis-encoded in the source and
    # is restored here as a best guess - confirm against the intended glyph.
    # Sections are separated by blank lines so the closing paragraph is not
    # rendered as part of list item 3.
    gr.Markdown(
        "# 🎙️ Transcript Timestamp Updater\n\n"
        "This tool updates timestamps in human-edited transcripts based on auto-generated transcripts.\n\n"
        "## Instructions:\n\n"
        "1. Paste your new auto-generated transcript (with updated timestamps)\n"
        "2. Paste your human-edited transcript (with old timestamps)\n"
        "3. Click \"Update Timestamps\"\n\n"
        "The tool will match segments between transcripts and update the timestamps "
        "while preserving all human edits.\n"
    )
    # Inputs: the two transcripts side by side.
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.TextArea(
                label="Auto-Generated Transcript (with new timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15,
            )
        with gr.Column():
            human_transcript = gr.TextArea(
                label="Human-Edited Transcript (with old timestamps)",
                placeholder="Paste the human-edited transcript here...",
                lines=15,
            )
    update_btn = gr.Button("Update Timestamps")
    # Outputs: the rebuilt transcript and the matching report, one tab each.
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20,
            )
        with gr.TabItem("Statistics"):
            stats = gr.Markdown(
                label="Statistics",
                value="Statistics will appear here...",
            )
    # process_transcripts must return one value per component listed in outputs.
    update_btn.click(
        fn=process_transcripts,
        inputs=[auto_transcript, human_transcript],
        outputs=[updated_transcript, stats],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()