Spaces:
Running
Running
File size: 7,475 Bytes
4322c44 1f131a4 4322c44 5725925 fb1eceb 5725925 1f131a4 fb1eceb 1f131a4 5725925 1f131a4 4322c44 1f131a4 4322c44 1f131a4 fb1eceb 1f131a4 5725925 1f131a4 5725925 1f131a4 5725925 4322c44 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 1f131a4 fb1eceb 1f131a4 5725925 fb1eceb 5725925 fb1eceb 1f131a4 fb1eceb 5725925 fb1eceb 1f131a4 5725925 1f131a4 5725925 1f131a4 fb1eceb 5725925 4322c44 5725925 1f131a4 5725925 1f131a4 fb1eceb 1f131a4 fb1eceb 5725925 4322c44 1f131a4 4322c44 1f131a4 4322c44 1f131a4 4322c44 5725925 1f131a4 5725925 4322c44 1f131a4 4322c44 5725925 214a4d6 5725925 4322c44 5725925 1f131a4 5725925 4322c44 5725925 4322c44 214a4d6 4322c44 5725925 1f131a4 5725925 4322c44 5725925 4322c44 5725925 4322c44 |
|
import gradio as gr
import re
import difflib
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
@dataclass
class Segment:
    """One speaker turn taken from a transcript.

    Groups the speaker label, the timestamp string from the turn's header,
    the spoken text, and the turn's ordinal position.
    """

    speaker: str
    timestamp: str
    text: str
    # Ordinal position of this segment in the list it was extracted into
    index: int
# A speaker label: one or more words separated by spaces/tabs. Words start
# with a letter and may contain letters, periods, apostrophes and hyphens.
_SPEAKER_NAME = r"[A-Za-z][A-Za-z.'\-]*(?:[ \t]+[A-Za-z][A-Za-z.'\-]*)*"

# Non-capturing copy of a header (name, timestamp, blank line). It is used in
# the look-ahead that decides where the text of the current segment ends.
_NEXT_HEADER = (
    r"(?:\*\*)?(?:%s)(?:\*\*)?\s+\*?[0-9:]+\*?\s*\n\n" % _SPEAKER_NAME
)

_SEGMENT_RE = re.compile(
    # Header: anchored to the start of a line so that a sentence ending in
    # something like "at 10:30" is not mistaken for a speaker header.
    r"^[ \t]*(?:\*\*)?(?P<speaker>%s)(?:\*\*)?\s+\*?(?P<timestamp>[0-9:]+)\*?\s*\n\n"
    # Body: everything up to the next real header (or the end of input), so
    # segments spanning several paragraphs are kept whole.
    r"(?P<text>.*?)"
    r"(?=^[ \t]*%s|\Z)" % (_SPEAKER_NAME, _NEXT_HEADER),
    re.DOTALL | re.MULTILINE,
)


def extract_segments(transcript: str) -> List["Segment"]:
    """
    Extract speaker segments from a transcript.

    A segment is a header line followed by a blank line and the spoken text.
    Both header formats are recognised:
    - Speaker LastName 00:00:00
    - **Speaker LastName** *00:00:00*

    The speaker may consist of several words and is captured in full
    (e.g. "Speaker LastName", not just "LastName"). The text of a segment
    runs until the next header or the end of the transcript and is returned
    with surrounding whitespace stripped.

    Args:
        transcript: Full transcript text.

    Returns:
        Segments in order of appearance; ``index`` is the zero-based position
        of each segment in the returned list. An empty list is returned when
        no header is found.
    """
    return [
        Segment(
            found.group("speaker"),
            found.group("timestamp"),
            found.group("text").strip(),
            position,
        )
        for position, found in enumerate(_SEGMENT_RE.finditer(transcript))
    ]
def clean_text_for_matching(text):
    """Normalise transcript text so two versions of it can be compared.

    Markdown links are reduced to their label, asterisks are dropped,
    common punctuation becomes whitespace, runs of whitespace collapse to a
    single space, and the result is lower-cased and trimmed.
    """
    # "[label](url)" -> "label"
    without_links = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Dropping every asterisk removes both **bold** and *italic* markers
    without_markup = without_links.replace('*', '')
    # Punctuation turns into a space (not nothing) so neighbouring words
    # stay separated
    punctuation_to_space = str.maketrans({mark: ' ' for mark in ',.;:!?()[]{}'})
    spaced = without_markup.translate(punctuation_to_space)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.lower().strip()
def find_best_matches(
    auto_segments: List["Segment"],
    human_segments: List["Segment"],
    threshold: float = 0.6,
) -> Dict[int, int]:
    """
    Find the best matching segments between auto and human transcripts.

    Matching is greedy: human segments are visited in order and each one
    claims the still-unclaimed auto segment whose cleaned text is most
    similar to its own, provided the similarity is strictly greater than
    ``threshold``. An auto segment is matched to at most one human segment.

    Args:
        auto_segments: Segments from the auto-generated transcript.
        human_segments: Segments from the human-edited transcript.
        threshold: Minimum ``difflib.SequenceMatcher`` ratio (exclusive) a
            pair must exceed to count as a match.

    Returns:
        Mapping of human segment position -> auto segment position (positions
        in the two input lists). Human segments without a good enough match
        are absent from the mapping.
    """
    matches: Dict[int, int] = {}
    # Prepare cleaned texts for comparison
    auto_cleaned_texts = [clean_text_for_matching(seg.text) for seg in auto_segments]
    human_cleaned_texts = [clean_text_for_matching(seg.text) for seg in human_segments]
    # Auto positions already taken; a set keeps the membership test O(1)
    claimed = set()

    for h_idx, h_text in enumerate(human_cleaned_texts):
        best_match = None
        best_score = threshold
        for a_idx, a_text in enumerate(auto_cleaned_texts):
            if a_idx in claimed:
                continue
            matcher = difflib.SequenceMatcher(None, h_text, a_text)
            # Both quick ratios are upper bounds on ratio(), so a candidate
            # whose bound does not beat the current best can be skipped
            # without running the expensive comparison.
            if matcher.real_quick_ratio() <= best_score:
                continue
            if matcher.quick_ratio() <= best_score:
                continue
            similarity = matcher.ratio()
            if similarity > best_score:
                best_score = similarity
                best_match = a_idx
        if best_match is not None:
            matches[h_idx] = best_match
            claimed.add(best_match)
    return matches
def _retime_headers(human_transcript, human_segments, auto_segments, matches):
    """Rewrite the header timestamp of every matched human segment.

    Returns ``(new_text, rewritten)`` where ``rewritten`` is the number of
    headers whose timestamp was actually replaced.
    """
    pieces = []
    cursor = 0
    rewritten = 0
    # Walk the headers front to back, always searching from the end of the
    # previous header. That way each segment's own header is the one edited,
    # even when the same speaker/timestamp pair occurs more than once.
    for h_idx, human_seg in enumerate(human_segments):
        header_re = re.compile(
            # Optional markdown markers are tolerated header by header, so
            # plain and markdown headers both work regardless of what the
            # rest of the transcript contains.
            r"(?:\*\*)?" + re.escape(human_seg.speaker) + r"(?:\*\*)?\s+\*?"
            + "(" + re.escape(human_seg.timestamp) + ")"
            # A header is always followed by a blank line
            + r"(?=\*?\s*\n\n)"
        )
        found = header_re.search(human_transcript, cursor)
        if found is None:
            continue
        start, end = found.span(1)
        pieces.append(human_transcript[cursor:start])
        if h_idx in matches:
            # Only the timestamp characters change; markers and spacing
            # around them are left exactly as the human wrote them.
            pieces.append(auto_segments[matches[h_idx]].timestamp)
            rewritten += 1
        else:
            pieces.append(human_transcript[start:end])
        cursor = end
    pieces.append(human_transcript[cursor:])
    return "".join(pieces), rewritten


def _preview(text, limit=50):
    """Return ``text`` cut to ``limit`` characters, with "..." when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _build_report(human_segments, auto_segments, matches, updated_count):
    """Build the markdown report; ``human_segments`` must not be empty."""
    human_count = len(human_segments)
    auto_count = len(auto_segments)
    lines = [
        "### Timestamp Update Report\n\n",
        f"- Human segments: {human_count}\n",
        f"- Auto segments: {auto_count}\n",
        f"- Matched segments with updated timestamps: {updated_count} ({updated_count/human_count*100:.1f}%)\n",
    ]
    if updated_count < human_count:
        lines.append(f"- Segments not updated: {human_count - updated_count}\n")
    if matches:
        lines.append("\n### Example matches (for verification):\n\n")
        # Show up to 5 matches
        for h_idx, a_idx in list(matches.items())[:5]:
            h_seg = human_segments[h_idx]
            a_seg = auto_segments[a_idx]
            lines.append(f"- {h_seg.speaker}: timestamp changed from `{h_seg.timestamp}` to `{a_seg.timestamp}`\n")
            lines.append(f" - Human: \"{_preview(h_seg.text)}\"\n")
            lines.append(f" - Auto: \"{_preview(a_seg.text)}\"\n\n")
    return "".join(lines)


def update_timestamps(human_transcript: str, auto_transcript: str) -> Tuple[str, str]:
    """
    Update timestamps in human transcript using timestamps from auto transcript.

    Both transcripts are split into segments, human segments are paired with
    auto segments by text similarity, and the timestamp in the header of each
    paired human segment is replaced by the paired auto segment's timestamp.
    Nothing else in the human transcript is modified.

    Args:
        human_transcript: Human-edited transcript whose timestamps are stale.
        auto_transcript: Auto-generated transcript with the wanted timestamps.

    Returns:
        ``(updated_transcript, report)``. ``report`` is markdown summarising
        how many segments were updated plus up to five example matches. If
        either transcript yields no segments, the first element is an error
        message and the report is an empty string.
    """
    # Extract segments from both transcripts
    human_segments = extract_segments(human_transcript)
    auto_segments = extract_segments(auto_transcript)
    if not human_segments or not auto_segments:
        return "Error: Could not parse transcripts. Check formatting.", ""

    # Find matching segments based on text similarity
    matches = find_best_matches(auto_segments, human_segments)

    updated_transcript, updated_count = _retime_headers(
        human_transcript, human_segments, auto_segments, matches
    )
    report = _build_report(human_segments, auto_segments, matches, updated_count)
    return updated_transcript, report
# Create Gradio interface
# Everything below is declared inside the Blocks context; the order of the
# statements is the order in which the components are laid out on the page.
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    # Page heading and usage instructions.
    # NOTE(review): the glyph in the heading looks mis-encoded (probably an
    # emoji in the original file) — confirm against the source before editing.
    gr.Markdown("""
# ๐๏ธ Transcript Timestamp Updater
This tool updates timestamps in a human-edited transcript by taking correct timestamps from an auto-generated transcript.
## Instructions:
1. Paste your auto-generated transcript (with correct timestamps)
2. Paste your human-edited transcript (with old timestamps that need updating)
3. Click "Update Timestamps"
The tool will preserve all human edits and only update the timestamps.
""")
    # Two side-by-side input boxes: auto transcript on the left, human on the right
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="Auto-Generated Transcript (with correct timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15
            )
        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (timestamps need updating)",
                placeholder="Paste your human-edited transcript here...",
                lines=15
            )
    update_btn = gr.Button("Update Timestamps")
    # Output area: one tab for the rewritten transcript, one for the report
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20
            )
        with gr.TabItem("Report"):
            report = gr.Markdown(
                label="Matching Report",
                value="Report will appear here..."
            )
    # The inputs are listed human-first to line up with the parameter order of
    # update_timestamps(human_transcript, auto_transcript); its two return
    # values fill the transcript box and the report, in that order.
    update_btn.click(
        fn=update_timestamps,
        inputs=[human_transcript, auto_transcript],
        outputs=[updated_transcript, report]
    )
# Launch the app
# Only start the server when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()