Spaces:
Sleeping
Sleeping
File size: 5,415 Bytes
4322c44 5725925 4322c44 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 4322c44 5725925 4322c44 5725925 fb1eceb 5725925 fb1eceb 5725925 4322c44 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 fb1eceb 5725925 4322c44 5725925 fb1eceb 5725925 fb1eceb 5725925 4322c44 5725925 4322c44 5725925 4322c44 5725925 4322c44 5725925 4322c44 5725925 4322c44 5725925 214a4d6 5725925 4322c44 5725925 4322c44 5725925 4322c44 214a4d6 4322c44 5725925 4322c44 5725925 4322c44 5725925 4322c44 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import gradio as gr
import re
from typing import List, Dict, Tuple
def extract_segments(transcript: str) -> List[Tuple[str, str, str]]:
    """Split a transcript into ``(speaker, timestamp, text)`` segments.

    A segment starts with a header line such as ``Alice 00:01:23`` or
    ``**Alice** *00:01:23*`` followed by a blank line. Its text runs until
    the next such header (or the end of the input), so a segment may span
    several paragraphs.

    Args:
        transcript: The full transcript text.

    Returns:
        The segments in document order. ``speaker`` is the bare name
        (letters only, markdown markers removed), ``timestamp`` is the run of
        digits and colons from the header, and ``text`` is the segment body
        with surrounding whitespace stripped. The list is empty when no
        header is found.
    """
    pattern = (
        # Header: optional bold speaker, optional italic timestamp, blank line.
        r"(?:\*\*)?([A-Za-z]+)(?:\*\*)?\s+\*?([0-9:]+)\*?\s*\n\n"
        # Body: as little as possible ...
        r"(.*?)"
        # ... up to the next *complete* header (speaker AND timestamp) or the
        # end of input. Requiring the timestamp keeps ordinary paragraphs that
        # merely start with a word from ending the segment early.
        r"(?=\n\n(?:\*\*)?[A-Za-z]+(?:\*\*)?\s+\*?[0-9:]+\*?\s*(?:\n\n|\Z)|\Z)"
    )
    return [
        (speaker, timestamp, text.strip())
        for speaker, timestamp, text in re.findall(pattern, transcript, re.DOTALL)
    ]
def find_matching_segments(auto_segments, human_segments):
    """
    Pair each human segment with an auto segment spoken by the same speaker.

    Both arguments are sequences of (speaker, timestamp, text) tuples. For
    every speaker, the n-th human segment is paired with the n-th auto
    segment of that speaker; timestamps and text are not compared.

    Returns a dictionary mapping human segment index to auto segment index.
    Human segments whose speaker has no auto segment left are omitted.
    """
    # Collect, per speaker, the auto segment indices in document order.
    indices_by_speaker = {}
    for auto_pos, (name, _stamp, _body) in enumerate(auto_segments):
        indices_by_speaker.setdefault(name, []).append(auto_pos)

    # One iterator per speaker hands out those indices exactly once each.
    pending = {name: iter(indices) for name, indices in indices_by_speaker.items()}

    pairing = {}
    for human_pos, (name, _stamp, _body) in enumerate(human_segments):
        candidates = pending.get(name)
        if candidates is None:
            continue
        auto_pos = next(candidates, None)
        if auto_pos is not None:
            pairing[human_pos] = auto_pos
    return pairing
def update_timestamps(human_transcript: str, auto_transcript: str) -> Tuple[str, str]:
    """Copy timestamps from the auto transcript into the human transcript.

    Segments are parsed with ``extract_segments`` and paired with
    ``find_matching_segments``. Only the timestamp characters of each paired
    human header are replaced; speaker names, markdown markers, spacing and
    all segment text are copied through unchanged.

    Args:
        human_transcript: Human-edited transcript whose timestamps are stale.
        auto_transcript: Auto-generated transcript with the wanted timestamps.

    Returns:
        ``(updated_transcript, report)`` where ``report`` is a markdown
        summary. If either transcript yields no segments, the first element
        is an error message and the second is an empty string.
    """
    human_segments = extract_segments(human_transcript)
    auto_segments = extract_segments(auto_transcript)
    if not human_segments or not auto_segments:
        return "Error: Could not parse transcripts. Check formatting.", ""

    matches = find_matching_segments(auto_segments, human_segments)

    # Walk the human transcript once, locating each segment header in order.
    # Searching from a moving cursor (instead of replacing the first textual
    # occurrence) guarantees that segment N's header is the one edited, even
    # when several headers share the same speaker and timestamp.
    pieces = []
    copied_upto = 0  # end of the source text already emitted into `pieces`
    cursor = 0  # where the search for the next header starts
    updated_count = 0
    for h_idx, (h_speaker, h_timestamp, h_text) in enumerate(human_segments):
        # Same header shape the segments were parsed with: optional bold
        # speaker, optional italic timestamp, then a blank line.
        header = re.compile(
            rf"(?:\*\*)?{re.escape(h_speaker)}(?:\*\*)?\s+"
            rf"\*?({re.escape(h_timestamp)})\*?\s*\n\n"
        )
        found = header.search(human_transcript, cursor)
        if found is None:
            # Header could not be located again; leave it untouched and do
            # not report it as updated.
            continue

        # Skip past this segment's body so header-like text inside it cannot
        # be mistaken for the next header.
        cursor = found.end()
        body_at = human_transcript.find(h_text, cursor)
        if body_at != -1:
            cursor = body_at + len(h_text)

        if h_idx not in matches:
            continue
        a_timestamp = auto_segments[matches[h_idx]][1]
        ts_start, ts_end = found.span(1)
        pieces.append(human_transcript[copied_upto:ts_start])
        pieces.append(a_timestamp)
        copied_upto = ts_end
        updated_count += 1

    pieces.append(human_transcript[copied_upto:])
    updated_transcript = "".join(pieces)

    # Generate report
    report_lines = [
        "### Timestamp Update Report\n",
        f"- Human segments: {len(human_segments)}",
        f"- Auto segments: {len(auto_segments)}",
        f"- Updated timestamps: {updated_count}",
    ]
    if updated_count < len(human_segments):
        report_lines.append(
            f"- Segments not updated: {len(human_segments) - updated_count}"
        )
    report = "\n".join(report_lines) + "\n"
    return updated_transcript, report
# Create Gradio interface

# Intro text is assembled line by line so the markdown structure (heading,
# paragraphs, numbered list) does not depend on source indentation.
_INTRO_MARKDOWN = (
    "# 🎙️ Simple Transcript Timestamp Updater\n"
    "\n"
    "This tool updates timestamps in a human-edited transcript based on an auto-generated transcript.\n"
    "\n"
    "## Instructions:\n"
    "\n"
    "1. Paste your auto-generated transcript (with correct timestamps)\n"
    "2. Paste your human-edited transcript (with old timestamps)\n"
    '3. Click "Update Timestamps"\n'
    "\n"
    "The tool will update only the timestamps while preserving all human edits.\n"
)

with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Inputs: the two transcripts side by side.
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="Auto-Generated Transcript (with correct timestamps)",
                placeholder="Paste the auto-generated transcript here...",
                lines=15,
            )
        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (with old timestamps)",
                placeholder="Paste your human-edited transcript here...",
                lines=15,
            )

    update_btn = gr.Button("Update Timestamps")

    # Outputs: the rewritten transcript and the summary report, one per tab.
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20,
            )
        with gr.TabItem("Report"):
            report = gr.Markdown(
                label="Report",
                value="Report will appear here...",
            )

    # update_timestamps takes (human, auto) and returns (transcript, report),
    # so the input and output lists follow that order rather than the
    # left-to-right order of the widgets on screen.
    update_btn.click(
        fn=update_timestamps,
        inputs=[human_transcript, auto_transcript],
        outputs=[updated_transcript, report],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()