Spaces:

dwarkesh
/

transcript-aligner

Sleeping

App Files Files Community

transcript-aligner / app.py

dwarkesh

Update app.py

214a4d6 verified about 1 month ago

raw

history blame

8.74 kB

	import gradio as gr
	import re
	import difflib
	from typing import List, Dict, Tuple, Optional
	from dataclasses import dataclass

	@dataclass
	class Segment:
	    """A single speaker turn taken from a transcript."""

	    # Speaker label taken from the turn header.
	    speaker: str
	    # Timestamp string taken from the turn header.
	    timestamp: str
	    # Body of the turn as it should appear in output.
	    text: str
	    # Body with formatting removed; intended for similarity matching only.
	    raw_text: str

	def parse_transcript(transcript: str) -> List[Segment]:
	    """Split a transcript into speaker segments.

	    Two header styles are recognised; each header must be followed by a
	    blank line and then the segment body:

	    - markdown: ``**Speaker X** *00:00:00*`` (the ``Speaker `` prefix is optional)
	    - plain:    ``Speaker X 00:00:00``

	    Args:
	        transcript: Full transcript text.

	    Returns:
	        One ``Segment`` per header found, in document order. ``speaker`` is
	        the single word after the optional ``Speaker `` prefix, ``text`` is
	        the stripped body, and ``raw_text`` is the body with markdown links
	        reduced to their visible text and emphasis asterisks removed.
	        An empty list is returned when no header is found.
	    """
	    # Pasted text may carry Windows line endings; the pattern expects "\n\n".
	    transcript = transcript.replace("\r\n", "\n")

	    # Header without capture groups, reused in the lookahead so that a body
	    # only ends at the next real header (or end of input), not at any
	    # paragraph break inside a multi-paragraph turn.
	    next_header = r"(?:\*\*)?(?:Speaker )?\w+(?:\*\*)? \*?\d{2}:\d{2}:\d{2}"
	    pattern = (
	        r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? "  # speaker, optionally bold
	        r"\*?(\d{2}:\d{2}:\d{2})\*?"               # timestamp, optionally italic
	        r"\s*\n\n"                                  # blank line after the header
	        r"(.*?)"                                    # body (lazy; DOTALL spans lines)
	        r"(?=\n\n" + next_header + r"|\Z)"
	    )

	    segments = []
	    for match in re.finditer(pattern, transcript, re.DOTALL):
	        speaker, timestamp, text = match.groups()
	        body = text.strip()
	        # Strip markdown for matching: keep link text, drop URLs and asterisks.
	        raw_text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", body).replace("*", "")
	        segments.append(Segment(speaker, timestamp, body, raw_text))

	    return segments

	def clean_text_for_comparison(text: str) -> str:
	    """Normalise text so two transcripts can be compared by wording alone.

	    Markdown links are reduced to their visible text, every character that
	    is neither a word character nor whitespace is removed (this covers
	    emphasis asterisks and ordinary punctuation), and the result is
	    lower-cased and stripped of leading/trailing whitespace.

	    Args:
	        text: Segment text, with or without markdown formatting.

	    Returns:
	        The normalised text; ``""`` for empty input.
	    """
	    # Handle links before the punctuation pass: otherwise the brackets and
	    # slashes vanish and the URL's letters are glued onto the link text.
	    text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", text)
	    text = re.sub(r"[^\w\s]", "", text.lower())
	    return text.strip()

	def match_segments(
	    auto_segments: List[Segment],
	    human_segments: List[Segment],
	    threshold: float = 0.6,
	) -> List[Tuple[int, int]]:
	    """Greedily pair human segments with auto segments by text similarity.

	    Human segments are processed in order. Each one takes the not-yet-used
	    auto segment with the highest ``difflib.SequenceMatcher`` ratio, provided
	    that ratio is at least ``threshold``. An auto segment is used at most once.

	    Args:
	        auto_segments: Segments from the auto-generated transcript.
	        human_segments: Segments from the human-edited transcript.
	        threshold: Minimum similarity ratio (0..1) for a pair to count.

	    Returns:
	        List of ``(auto_index, human_index)`` tuples, ordered by human index.
	    """
	    auto_texts = [clean_text_for_comparison(seg.raw_text) for seg in auto_segments]
	    human_texts = [clean_text_for_comparison(seg.raw_text) for seg in human_segments]

	    matches: List[Tuple[int, int]] = []
	    used_auto = set()  # O(1) "already matched" test instead of rescanning matches

	    matcher = difflib.SequenceMatcher(None)
	    for human_idx, human_text in enumerate(human_texts):
	        # SequenceMatcher caches its analysis of the second sequence, so set
	        # the human text once and swap only the first sequence per candidate.
	        matcher.set_seq2(human_text)

	        best_match_idx = -1
	        best_similarity = 0.0
	        for auto_idx, auto_text in enumerate(auto_texts):
	            if auto_idx in used_auto:
	                continue

	            matcher.set_seq1(auto_text)
	            similarity = matcher.ratio()
	            if similarity >= threshold and similarity > best_similarity:
	                best_similarity = similarity
	                best_match_idx = auto_idx

	        if best_match_idx >= 0:
	            used_auto.add(best_match_idx)
	            matches.append((best_match_idx, human_idx))

	    return matches

	def update_timestamps(
	    auto_segments: List[Segment],
	    human_segments: List[Segment],
	    matches: List[Tuple[int, int]],
	    is_markdown: Optional[bool] = None,
	) -> str:
	    """Rebuild the human transcript with timestamps taken from matched auto segments.

	    Args:
	        auto_segments: Segments carrying the new timestamps.
	        human_segments: Human-edited segments; not modified.
	        matches: ``(auto_index, human_index)`` pairs.
	        is_markdown: Force markdown (``True``) or plain (``False``) headers.
	            When ``None`` the style is inferred: markdown if any human
	            segment's text contains ``*``. Segments do not record how their
	            header was styled, so this inference is a heuristic.

	    Returns:
	        The transcript text, segments separated by blank lines. Headers are
	        ``**speaker** *timestamp*`` in markdown mode and
	        ``Speaker speaker timestamp`` otherwise. Unmatched human segments
	        keep their old timestamp. Empty input yields ``""``.
	    """
	    # Local import: the module's import block only brings in `dataclass`.
	    from dataclasses import replace

	    updated_segments = list(human_segments)
	    for auto_idx, human_idx in matches:
	        # Keep the human-edited fields; swap in only the new timestamp.
	        updated_segments[human_idx] = replace(
	            human_segments[human_idx],
	            timestamp=auto_segments[auto_idx].timestamp,
	        )

	    if is_markdown is None:
	        is_markdown = any("*" in segment.text for segment in human_segments)

	    result = []
	    for segment in updated_segments:
	        if is_markdown:
	            header = f"**{segment.speaker}** *{segment.timestamp}*"
	        else:
	            header = f"Speaker {segment.speaker} {segment.timestamp}"
	        result.append(f"{header}\n\n{segment.text}")

	    return "\n\n".join(result)

	def get_unmatched_auto_segments(auto_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
	    """Return, in ascending order, the auto-segment indices absent from ``matches``.

	    The auto index is the first element of each match tuple.
	    """
	    remaining = set(range(len(auto_segments)))
	    remaining.difference_update(pair[0] for pair in matches)
	    return sorted(remaining)

	def get_unmatched_human_segments(human_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
	    """Return, in ascending order, the human-segment indices absent from ``matches``.

	    The human index is the second element of each match tuple.
	    """
	    remaining = set(range(len(human_segments)))
	    remaining.difference_update(pair[1] for pair in matches)
	    return sorted(remaining)

	def format_segments(segments: List[Segment], indices: List[int], is_markdown: bool) -> str:
	    """Render the selected segments as transcript text for display.

	    Args:
	        segments: All segments of one transcript.
	        indices: Positions in ``segments`` to render, in output order.
	        is_markdown: Use ``**speaker** *timestamp*`` headers when true,
	            ``Speaker speaker timestamp`` headers otherwise.

	    Returns:
	        The rendered segments separated by blank lines, or the string
	        ``"None"`` when ``indices`` is empty.
	    """
	    if not indices:
	        return "None"

	    def render(segment):
	        if is_markdown:
	            header = f"**{segment.speaker}** *{segment.timestamp}*"
	        else:
	            header = f"Speaker {segment.speaker} {segment.timestamp}"
	        return f"{header}\n\n{segment.text}"

	    return "\n\n".join(render(segments[idx]) for idx in indices)

	def process_transcripts(auto_transcript: str, human_transcript: str):
	    """Carry the auto transcript's timestamps over to the human-edited transcript.

	    Args:
	        auto_transcript: Auto-generated transcript text (new timestamps).
	        human_transcript: Human-edited transcript text (old timestamps).

	    Returns:
	        A 2-tuple ``(updated_transcript, stats_markdown)``. If either input
	        yields no segments, the first element is an error message and the
	        second is empty.
	    """
	    auto_segments = parse_transcript(auto_transcript)
	    human_segments = parse_transcript(human_transcript)

	    # Return exactly two values here as well: the click handler is wired to
	    # two output components, so a third value would not line up with them.
	    if not auto_segments or not human_segments:
	        return "Error: Could not parse transcripts. Check formatting.", ""

	    matches = match_segments(auto_segments, human_segments)

	    unmatched_auto = get_unmatched_auto_segments(auto_segments, matches)
	    unmatched_human = get_unmatched_human_segments(human_segments, matches)

	    # Markdown transcripts mark headers and emphasis with asterisks.
	    is_markdown = "*" in human_transcript

	    updated_transcript = update_timestamps(auto_segments, human_segments, matches)

	    # Collect the report in pieces and join once at the end.
	    report = [
	        "### Matching Statistics\n\n",
	        f"- Auto-generated segments: {len(auto_segments)}\n",
	        f"- Human-edited segments: {len(human_segments)}\n",
	        f"- Matched segments: {len(matches)}\n",
	        f"- Unmatched auto segments (new content): {len(unmatched_auto)}\n",
	        f"- Unmatched human segments (removed content): {len(unmatched_human)}\n",
	    ]

	    if unmatched_auto:
	        report.append("\n### New Content (In Auto-generated but not in Human-edited)\n\n")
	        report.append(format_segments(auto_segments, unmatched_auto, is_markdown))

	    if unmatched_human:
	        report.append("\n### Removed Content (In Human-edited but not in Auto-generated)\n\n")
	        report.append(format_segments(human_segments, unmatched_human, is_markdown))

	    return updated_transcript, "".join(report)

	# Create Gradio interface
	# NOTE(review): components are declared in the order they should appear on
	# the page; keep statement order intact when editing this block.
	with gr.Blocks(title="Transcript Timestamp Updater") as demo:
	    # Page header and usage instructions.
	    gr.Markdown("""
	# 🎙️ Transcript Timestamp Updater

	This tool updates timestamps in human-edited transcripts based on auto-generated transcripts.

	## Instructions:
	1. Paste your new auto-generated transcript (with updated timestamps)
	2. Paste your human-edited transcript (with old timestamps)
	3. Click "Update Timestamps"

	The tool will match segments between transcripts and update the timestamps while preserving all human edits.
	""")

	    # Two side-by-side inputs: auto transcript on the left, human on the right.
	    with gr.Row():
	        with gr.Column():
	            auto_transcript = gr.TextArea(
	                label="Auto-Generated Transcript (with new timestamps)",
	                placeholder="Paste the auto-generated transcript here...",
	                lines=15
	            )

	        with gr.Column():
	            human_transcript = gr.TextArea(
	                label="Human-Edited Transcript (with old timestamps)",
	                placeholder="Paste the human-edited transcript here...",
	                lines=15
	            )

	    update_btn = gr.Button("Update Timestamps")

	    # Results: the rewritten transcript and the matching report, one tab each.
	    with gr.Tabs():
	        with gr.TabItem("Updated Transcript"):
	            updated_transcript = gr.TextArea(
	                label="Updated Transcript",
	                placeholder="The updated transcript will appear here...",
	                lines=20
	            )

	        with gr.TabItem("Statistics"):
	            stats = gr.Markdown(
	                label="Statistics",
	                value="Statistics will appear here..."
	            )

	    # process_transcripts must return one value per entry in `outputs`
	    # (transcript text first, statistics markdown second).
	    update_btn.click(
	        fn=process_transcripts,
	        inputs=[auto_transcript, human_transcript],
	        outputs=[updated_transcript, stats]
	    )

	# Launch the app
	# Only start the server when run as a script; importing the module just
	# builds `demo` without launching it.
	if __name__ == "__main__":
	    demo.launch()