Spaces:

dwarkesh
/

transcript-aligner

Running

App Files Files Community

transcript-aligner / app.py

dwarkesh

Create app.py

4322c44 verified 3 months ago

raw

history blame

10.6 kB

	import gradio as gr
	import re
	import difflib
	from typing import List, Dict, Tuple, Optional
	import numpy as np
	from dataclasses import dataclass

	@dataclass
	class Segment:
	    """One speaker turn parsed from a transcript.

	    Attributes:
	        speaker: Speaker label captured from the segment header.
	        timestamp: Timestamp string captured from the segment header.
	        text: Segment body as it should appear in output.
	        raw_text: Segment body with formatting removed; used only when
	            comparing segments for similarity.
	    """

	    speaker: str
	    timestamp: str
	    text: str
	    raw_text: str  # For matching purposes - text without formatting

	@dataclass
	class Match:
	    """A pairing between one auto-generated and one human-edited segment.

	    Attributes:
	        auto_index: Position of the segment in the auto-generated list.
	        human_index: Position of the segment in the human-edited list.
	        similarity: Similarity score that justified the pairing.
	    """

	    auto_index: int
	    human_index: int
	    similarity: float

	def parse_auto_transcript(transcript: str) -> List[Segment]:
	    """Parse the auto-generated transcript into segments.

	    A segment is a header of the form ``Speaker X HH:MM:SS`` (the speaker
	    part may be wrapped in ``**`` and the timestamp in ``*``), followed by a
	    blank line and the body text. The body runs until the next blank line
	    that is followed by a ``Speaker`` header, or until the end of the input.

	    Args:
	        transcript: Full text of the auto-generated transcript.

	    Returns:
	        The parsed segments in document order; an empty list when no
	        header is found.
	    """
	    # Pasted text may carry Windows line endings, which would hide the
	    # blank line the pattern relies on.
	    transcript = transcript.replace("\r\n", "\n")

	    pattern = (
	        r"(?:\*\*)?Speaker (\w+)(?:\*\*)? "          # optional bold speaker label
	        r"(?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n"  # optional italic timestamp
	        r"(.*?)(?=\n\n(?:\*\*)?Speaker |\Z)"         # body up to next header / end
	    )

	    segments = []
	    for match in re.finditer(pattern, transcript, re.DOTALL):
	        speaker, timestamp, text = match.groups()
	        body = text.strip()
	        # Remove markdown emphasis markers for matching purposes
	        raw_text = body.replace("*", "")
	        segments.append(Segment(speaker, timestamp, body, raw_text))

	    return segments

	def parse_human_transcript(transcript: str) -> List[Segment]:
	    """Parse the human-edited transcript into segments.

	    Handles both the markdown header form (``**Name** *HH:MM:SS*``) and the
	    plain form (``Speaker X HH:MM:SS``); the ``Speaker `` prefix and the
	    emphasis markers are all optional. A header is followed by a blank line
	    and the body, which runs until the next blank line that is followed by
	    another complete header (label plus timestamp) or until the end of the
	    input, so multi-paragraph bodies stay in one segment.

	    Args:
	        transcript: Full text of the human-edited transcript.

	    Returns:
	        The parsed segments in document order; an empty list when no
	        header is found.
	    """
	    # Pasted text may carry Windows line endings, which would hide the
	    # blank line the pattern relies on.
	    transcript = transcript.replace("\r\n", "\n")

	    # NOTE(review): the speaker label is a single word-character run, so a
	    # multi-word name is captured as its last word only - confirm this
	    # matches the transcripts actually fed to the tool.
	    next_header = (
	        r"(?:\*\*)?(?:Speaker )?\w+(?:\*\*)? "
	        r"(?:\*)?\d{2}:\d{2}:\d{2}(?:\*)?"
	    )
	    pattern = (
	        r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? "
	        r"(?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n"
	        r"(.*?)(?=\n\n" + next_header + r"|\Z)"
	    )

	    segments = []
	    for match in re.finditer(pattern, transcript, re.DOTALL):
	        speaker, timestamp, text = match.groups()
	        body = text.strip()
	        # Remove markdown formatting for matching purposes: links are
	        # reduced to their visible text, emphasis markers are dropped.
	        raw_text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", body)
	        raw_text = raw_text.replace("*", "")
	        segments.append(Segment(speaker, timestamp, body, raw_text))

	    return segments

	def similarity_score(text1: str, text2: str) -> float:
	    """Return a similarity ratio in ``[0.0, 1.0]`` for two text segments.

	    Both texts are lowercased and stripped of every character that is
	    neither a word character nor whitespace before being compared
	    character by character with ``difflib.SequenceMatcher``.

	    Args:
	        text1: First text.
	        text2: Second text.

	    Returns:
	        ``SequenceMatcher.ratio()`` of the cleaned texts; ``1.0`` means the
	        cleaned texts are identical (including two empty texts).
	    """
	    clean1 = re.sub(r'[^\w\s]', '', text1.lower())
	    clean2 = re.sub(r'[^\w\s]', '', text2.lower())

	    # NOTE(review): SequenceMatcher's default autojunk heuristic applies to
	    # sequences of 200+ items and may depress the ratio for long segments;
	    # disabling it is more accurate but slower - measure before changing.
	    return difflib.SequenceMatcher(None, clean1, clean2).ratio()

	def find_best_matches(
	    auto_segments: List[Segment],
	    human_segments: List[Segment],
	    high_threshold: float = 0.6,
	    low_threshold: float = 0.4,
	) -> List[Match]:
	    """Greedily pair auto-generated segments with human-edited segments.

	    Pass 1 walks the auto segments in order and gives each one the most
	    similar still-unclaimed human segment whose score is at least
	    ``high_threshold``. Pass 2 revisits the auto segments left unmatched and
	    accepts the best still-unclaimed human segment scoring at least
	    ``low_threshold``. Each human segment is claimed at most once, and ties
	    go to the earliest human segment.

	    Every pair is scored only once: pairs that clear ``low_threshold``
	    during pass 1 are remembered and reused in pass 2.

	    Args:
	        auto_segments: Segments from the auto-generated transcript.
	        human_segments: Segments from the human-edited transcript.
	        high_threshold: Minimum similarity for a first-pass match.
	        low_threshold: Minimum similarity for a second-pass match.

	    Returns:
	        First-pass matches in auto-segment order, followed by second-pass
	        matches in auto-segment order.
	    """
	    matches = []
	    used_human_indices = set()
	    # auto index -> [(human index, score)] for auto segments that found no
	    # first-pass match; only pairs scoring >= low_threshold are kept.
	    fallback_candidates: Dict[int, List[Tuple[int, float]]] = {}

	    # First pass: find obvious matches (high similarity)
	    for auto_idx, auto_segment in enumerate(auto_segments):
	        best_match_idx = -1
	        best_similarity = 0.0
	        candidates = []

	        for human_idx, human_segment in enumerate(human_segments):
	            if human_idx in used_human_indices:
	                continue

	            similarity = similarity_score(auto_segment.raw_text, human_segment.raw_text)

	            if similarity >= low_threshold:
	                candidates.append((human_idx, similarity))
	            if similarity > best_similarity and similarity >= high_threshold:
	                best_similarity = similarity
	                best_match_idx = human_idx

	        if best_match_idx >= 0:
	            matches.append(Match(auto_idx, best_match_idx, best_similarity))
	            used_human_indices.add(best_match_idx)
	        elif candidates:
	            fallback_candidates[auto_idx] = candidates

	    # Second pass: match the remaining segments with the lower threshold.
	    # Human segments can only have become claimed since pass 1, so the
	    # remembered candidates cover every pair that is still eligible.
	    for auto_idx, candidates in fallback_candidates.items():
	        best_match_idx = -1
	        best_similarity = 0.0

	        for human_idx, similarity in candidates:
	            if human_idx in used_human_indices:
	                continue
	            if similarity > best_similarity:
	                best_similarity = similarity
	                best_match_idx = human_idx

	        if best_match_idx >= 0:
	            matches.append(Match(auto_idx, best_match_idx, best_similarity))
	            used_human_indices.add(best_match_idx)

	    return matches

	def update_timestamps(
	    auto_segments: List[Segment],
	    human_segments: List[Segment],
	    matches: List[Match],
	    is_markdown: Optional[bool] = None,
	) -> str:
	    """Rebuild the human transcript with timestamps taken from matched auto segments.

	    Every human segment is emitted in its original order. A matched segment
	    gets the timestamp of its auto-generated counterpart; an unmatched one
	    keeps its own timestamp. Speaker labels and body text are never changed.

	    Args:
	        auto_segments: Segments from the auto-generated transcript.
	        human_segments: Segments from the human-edited transcript.
	        matches: Pairings between the two lists.
	        is_markdown: Emit ``**speaker** *timestamp*`` headers when true and
	            ``Speaker <speaker> <timestamp>`` headers when false. When
	            omitted, markdown is assumed if any human segment body contains
	            ``*`` (a segment keeps only its body text, not its header
	            markers, so the body is the only evidence available here).

	    Returns:
	        The updated transcript, segments separated by blank lines; an empty
	        string when ``human_segments`` is empty.
	    """
	    # Decide the output format once rather than per segment.
	    if is_markdown is None:
	        is_markdown = any("*" in segment.text for segment in human_segments)

	    # human index -> replacement timestamp (a later match for the same
	    # human index overrides an earlier one).
	    new_timestamps = {
	        match.human_index: auto_segments[match.auto_index].timestamp
	        for match in matches
	    }

	    # Generate the updated transcript
	    result = []
	    for index, segment in enumerate(human_segments):
	        timestamp = new_timestamps.get(index, segment.timestamp)
	        if is_markdown:
	            header = f"**{segment.speaker}** *{timestamp}*"
	        else:
	            header = f"Speaker {segment.speaker} {timestamp}"
	        result.append(f"{header}\n\n{segment.text}")

	    return "\n\n".join(result)

	def find_unmatched_segments(auto_segments: List[Segment], matches: List[Match]) -> List[int]:
	    """Return the indices of auto segments that no match refers to.

	    Args:
	        auto_segments: Segments from the auto-generated transcript.
	        matches: Pairings produced by the matching step.

	    Returns:
	        Indices into ``auto_segments``, in ascending order, that do not
	        appear as ``auto_index`` in any match.
	    """
	    matched_auto_indices = {match.auto_index for match in matches}
	    return [i for i in range(len(auto_segments)) if i not in matched_auto_indices]

	def format_unmatched_segments(auto_segments: List[Segment], unmatched_indices: List[int], is_markdown: bool) -> str:
	    """Format the unmatched auto segments for display.

	    Args:
	        auto_segments: Segments from the auto-generated transcript.
	        unmatched_indices: Indices into ``auto_segments`` to display.
	        is_markdown: When true, headers are emitted as
	            ``**Speaker X** *timestamp*``; otherwise as plain
	            ``Speaker X timestamp``.

	    Returns:
	        A markdown string starting with a heading and listing each
	        unmatched segment, or a short notice when there are none.
	    """
	    if not unmatched_indices:
	        return "No unmatched segments found"

	    result = []
	    for idx in unmatched_indices:
	        segment = auto_segments[idx]
	        if is_markdown:
	            header = f"**Speaker {segment.speaker}** *{segment.timestamp}*"
	        else:
	            header = f"Speaker {segment.speaker} {segment.timestamp}"
	        result.append(f"{header}\n\n{segment.text}")

	    return "### Unmatched Segments (New Content)\n\n" + "\n\n".join(result)

	def process_transcripts(auto_transcript: str, human_transcript: str) -> Tuple[str, str, str]:
	    """Align the two transcripts and report the outcome.

	    Parses both inputs, pairs their segments by text similarity, rewrites
	    the human transcript with the auto transcript's timestamps, and builds
	    a report of auto segments that found no partner plus matching
	    statistics.

	    Args:
	        auto_transcript: New auto-generated transcript text.
	        human_transcript: Human-edited transcript text.

	    Returns:
	        ``(updated_transcript, unmatched_segments, stats)``. If either
	        transcript yields no segments, the first element is an error
	        message and the other two are empty strings.
	    """
	    # Parse both transcripts
	    auto_segments = parse_auto_transcript(auto_transcript)
	    human_segments = parse_human_transcript(human_transcript)

	    # Early check for empty inputs
	    if not auto_segments or not human_segments:
	        return "Error: Could not parse one or both transcripts. Please check the format.", "", ""

	    matches = find_best_matches(auto_segments, human_segments)
	    unmatched_indices = find_unmatched_segments(auto_segments, matches)

	    # Treat the human transcript as markdown when it contains any emphasis
	    # marker (this covers both the bold and the italic form).
	    is_markdown = "*" in human_transcript

	    updated_transcript = update_timestamps(auto_segments, human_segments, matches)
	    unmatched_segments = format_unmatched_segments(auto_segments, unmatched_indices, is_markdown)

	    # Stats about the matching
	    stats_parts = [
	        "### Matching Statistics\n\n",
	        f"- Auto-generated segments: {len(auto_segments)}\n",
	        f"- Human-edited segments: {len(human_segments)}\n",
	        f"- Matched segments: {len(matches)}\n",
	        f"- Unmatched segments: {len(unmatched_indices)}\n",
	    ]

	    if matches:
	        similarities = [match.similarity for match in matches]
	        stats_parts.append(
	            f"- Average match similarity: {sum(similarities)/len(similarities):.2f}\n"
	        )

	        # Histogram of match qualities, one line per bin
	        bins = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
	        hist, _ = np.histogram(similarities, bins=bins)
	        stats_parts.append("\n#### Match Quality Distribution\n\n")
	        for lower, upper, count in zip(bins, bins[1:], hist):
	            stats_parts.append(f"- {lower:.1f}-{upper:.1f}: {count} matches\n")

	    stats = "".join(stats_parts)
	    return updated_transcript, unmatched_segments, stats

	# Create Gradio interface.
	# Everything below is nested inside the Blocks context so that the
	# components and the click handler are registered on `demo`.
	with gr.Blocks(title="Transcript Timestamp Updater") as demo:
	    gr.Markdown("""
	# Transcript Timestamp Updater

	This tool updates timestamps in a human-edited transcript based on a new auto-generated transcript.

	## Instructions:
	1. Paste your new auto-generated transcript (with updated timestamps)
	2. Paste your human-edited transcript (with old timestamps)
	3. Click "Update Timestamps" to generate a new version of the human-edited transcript with updated timestamps

	The tool will try to match segments between the two transcripts and update the timestamps accordingly.
	""")

	    # Inputs: the two transcripts, side by side.
	    with gr.Row():
	        with gr.Column():
	            auto_transcript = gr.Textbox(
	                label="New Auto-Generated Transcript (with updated timestamps)",
	                placeholder="Paste the new auto-generated transcript here...",
	                lines=15
	            )

	        with gr.Column():
	            human_transcript = gr.Textbox(
	                label="Human-Edited Transcript (with old timestamps)",
	                placeholder="Paste your human-edited transcript here...",
	                lines=15
	            )

	    update_btn = gr.Button("Update Timestamps")

	    # Outputs: one tab per value returned by process_transcripts.
	    with gr.Tabs():
	        with gr.TabItem("Updated Transcript"):
	            updated_transcript = gr.TextArea(
	                label="Updated Human Transcript",
	                placeholder="The updated transcript will appear here...",
	                lines=20
	            )

	        with gr.TabItem("Unmatched Segments"):
	            unmatched_segments = gr.Markdown(
	                label="Unmatched Segments",
	                value="Unmatched segments will appear here..."
	            )

	        with gr.TabItem("Statistics"):
	            stats = gr.Markdown(
	                label="Matching Statistics",
	                value="Statistics will appear here..."
	            )

	    # The output order must match the tuple returned by process_transcripts.
	    update_btn.click(
	        fn=process_transcripts,
	        inputs=[auto_transcript, human_transcript],
	        outputs=[updated_transcript, unmatched_segments, stats]
	    )

	# Launch the app
	if __name__ == "__main__":
	    demo.launch()