Spaces:

dwarkesh
/

transcript-aligner

Sleeping

App Files Files Community

transcript-aligner / app.py

dwarkesh

Update app.py

5725925 verified 3 months ago

raw

history blame

5.42 kB

	import gradio as gr
	import re
	from typing import List, Dict, Tuple

	def extract_segments(transcript: str) -> List[Tuple[str, str, str]]:
	    """
	    Extract segments from a transcript.

	    A segment starts with a header made of a single-word speaker name
	    (optionally bold, ``**Name**``) followed by a timestamp of digits and
	    colons (optionally italic, ``*00:00:00*``) and then a blank line. The
	    segment text is everything up to the next header, or to the end of the
	    transcript for the last segment.

	    Args:
	        transcript: Full transcript text. Empty or ``None`` yields no segments.

	    Returns:
	        A list of tuples ``(speaker, timestamp, text)`` in document order.
	        ``speaker`` and ``timestamp`` have their markdown markers removed and
	        ``text`` is stripped of surrounding whitespace.
	    """
	    if not transcript:
	        return []

	    header = re.compile(
	        r"(?:\*\*)?([A-Za-z]+)(?:\*\*)?"  # speaker, optionally wrapped in **
	        r"\s+\*?([0-9:]+)\*?"             # timestamp, optionally wrapped in *
	        r"\s*\n\n"                        # header must be followed by a blank line
	    )
	    headers = list(header.finditer(transcript))

	    # Each segment's text ends where the next header begins; slicing between
	    # headers keeps multi-paragraph text intact and does not lose a header
	    # that immediately follows another one.
	    text_ends = [found.start() for found in headers[1:]] + [len(transcript)]

	    return [
	        (found.group(1), found.group(2), transcript[found.end():text_end].strip())
	        for found, text_end in zip(headers, text_ends)
	    ]

	def find_matching_segments(
	    auto_segments: List[Tuple[str, str, str]],
	    human_segments: List[Tuple[str, str, str]],
	) -> Dict[int, int]:
	    """
	    Find matching segments between auto and human transcripts.

	    Very simple matching based on speaker sequence: the n-th human segment
	    of a given speaker is paired with the n-th auto segment of that same
	    speaker. Segment text and timestamps are not compared.

	    Args:
	        auto_segments: ``(speaker, timestamp, text)`` tuples from the
	            auto-generated transcript.
	        human_segments: ``(speaker, timestamp, text)`` tuples from the
	            human-edited transcript.

	    Returns:
	        A dictionary mapping human segment index to auto segment index.
	        Human segments whose speaker has no remaining auto segment are
	        left out of the mapping.
	    """
	    # Group auto segment indices by speaker, preserving document order.
	    indices_by_speaker = {}
	    for a_idx, (speaker, _, _) in enumerate(auto_segments):
	        indices_by_speaker.setdefault(speaker, []).append(a_idx)

	    # One iterator per speaker hands out that speaker's indices in order
	    # without the O(n) cost of repeatedly popping from the front of a list.
	    remaining = {
	        speaker: iter(indices) for speaker, indices in indices_by_speaker.items()
	    }

	    matches = {}
	    for h_idx, (speaker, _, _) in enumerate(human_segments):
	        queue = remaining.get(speaker)
	        if queue is None:
	            continue
	        a_idx = next(queue, None)
	        # Index 0 is a valid match, so test against None explicitly.
	        if a_idx is not None:
	            matches[h_idx] = a_idx

	    return matches

	def update_timestamps(human_transcript: str, auto_transcript: str) -> Tuple[str, str]:
	    """
	    Update timestamps in human transcript using timestamps from auto transcript.

	    Only the timestamp characters of each matched segment header are
	    replaced; speaker names, markdown markers (``**``/``*``), whitespace and
	    all segment text are preserved exactly as written.

	    Args:
	        human_transcript: Human-edited transcript carrying the old timestamps.
	        auto_transcript: Auto-generated transcript carrying the correct
	            timestamps.

	    Returns:
	        A tuple ``(updated_transcript, report)`` where ``report`` is a
	        markdown summary. If either transcript yields no segments, returns
	        an error message as the first element and an empty report.
	    """
	    human_transcript = human_transcript or ""
	    auto_transcript = auto_transcript or ""

	    # Extract segments from both transcripts
	    human_segments = extract_segments(human_transcript)
	    auto_segments = extract_segments(auto_transcript)

	    if not human_segments or not auto_segments:
	        return "Error: Could not parse transcripts. Check formatting.", ""

	    # Find matching segments
	    matches = find_matching_segments(auto_segments, human_segments)

	    pieces = []        # alternating runs of untouched text and new timestamps
	    copied_upto = 0    # end of the input already emitted into `pieces`
	    search_from = 0    # headers are located in document order from here
	    updated = 0

	    for h_idx, (speaker, old_timestamp, _) in enumerate(human_segments):
	        # Locate this segment's own header rather than the first textual
	        # occurrence anywhere in the transcript, so repeated or
	        # prefix-sharing timestamps cannot redirect the edit elsewhere.
	        header = re.compile(
	            rf"(?<![A-Za-z])(?:\*\*)?{re.escape(speaker)}(?:\*\*)?\s+\*?"
	            rf"(?P<ts>{re.escape(old_timestamp)})(?=\*?\s*\n\n)"
	        )
	        found = header.search(human_transcript, search_from)
	        if found is None:
	            continue
	        search_from = found.end()

	        if h_idx not in matches:
	            continue

	        ts_start, ts_end = found.span("ts")
	        pieces.append(human_transcript[copied_upto:ts_start])
	        pieces.append(auto_segments[matches[h_idx]][1])
	        copied_upto = ts_end
	        updated += 1

	    pieces.append(human_transcript[copied_upto:])
	    updated_transcript = "".join(pieces)

	    # Generate report
	    report = (
	        "### Timestamp Update Report\n\n"
	        f"- Human segments: {len(human_segments)}\n"
	        f"- Auto segments: {len(auto_segments)}\n"
	        f"- Updated timestamps: {updated}\n"
	    )

	    unmatched = len(human_segments) - updated
	    if unmatched:
	        report += f"- Segments not updated: {unmatched}\n"

	    return updated_transcript, report

	# Create Gradio interface: two input boxes side by side, a button, and a
	# tabbed output area (updated transcript + markdown report).
	with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
	    gr.Markdown("""
	    # 🎙️ Simple Transcript Timestamp Updater

	    This tool updates timestamps in a human-edited transcript based on an auto-generated transcript.

	    ## Instructions:
	    1. Paste your auto-generated transcript (with correct timestamps)
	    2. Paste your human-edited transcript (with old timestamps)
	    3. Click "Update Timestamps"

	    The tool will update only the timestamps while preserving all human edits.
	    """)

	    with gr.Row():
	        with gr.Column():
	            auto_transcript = gr.Textbox(
	                label="Auto-Generated Transcript (with correct timestamps)",
	                placeholder="Paste the auto-generated transcript here...",
	                lines=15
	            )

	        with gr.Column():
	            human_transcript = gr.Textbox(
	                label="Human-Edited Transcript (with old timestamps)",
	                placeholder="Paste your human-edited transcript here...",
	                lines=15
	            )

	    update_btn = gr.Button("Update Timestamps")

	    with gr.Tabs():
	        with gr.TabItem("Updated Transcript"):
	            updated_transcript = gr.TextArea(
	                label="Updated Transcript",
	                placeholder="The updated transcript will appear here...",
	                lines=20
	            )

	        with gr.TabItem("Report"):
	            report = gr.Markdown(
	                label="Report",
	                value="Report will appear here..."
	            )

	    # Input order matches update_timestamps(human_transcript, auto_transcript),
	    # even though the auto box is shown first in the layout.
	    update_btn.click(
	        fn=update_timestamps,
	        inputs=[human_transcript, auto_transcript],
	        outputs=[updated_transcript, report]
	    )

	# Launch the app only when run as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()