dwarkesh committed on
Commit
1f131a4
·
verified ·
1 Parent(s): 5725925

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -48
app.py CHANGED
@@ -1,50 +1,86 @@
1
  import gradio as gr
2
  import re
3
- from typing import List, Dict, Tuple
 
 
 
 
 
 
 
 
 
 
4
 
def extract_segments(transcript):
    """
    Extract speaker segments from a transcript.

    A segment starts with a header line that stands alone on its line and is
    followed by a blank line. Both header formats are recognised:

    - Speaker LastName 00:00:00
    - **Speaker LastName** *00:00:00*

    The segment text is everything between one header and the next header
    (or the end of the transcript), so multi-paragraph turns are kept whole.

    Args:
        transcript: Full transcript text.

    Returns:
        A list of tuples: (speaker, timestamp, text), in document order.
        The list is empty when no header is found.
    """
    # The header must fill an entire line (anchored with ^ and a look-ahead
    # for the blank line) so that ordinary sentences are not parsed as
    # headers. The timestamp needs at least one colon, which keeps a bare
    # number at the end of a paragraph ("... in 2020") from matching.
    header_re = re.compile(
        r"^(?:\*\*)?([A-Za-z][A-Za-z0-9.'\- ]*?)(?:\*\*)?[ \t]+"
        r"\*?(\d+(?::\d+)+)\*?[ \t\r]*(?=\n[ \t\r]*\n)",
        re.MULTILINE,
    )

    headers = list(header_re.finditer(transcript))
    # Each segment's text ends where the next header begins; the last one
    # runs to the end of the transcript.
    text_ends = [header.start() for header in headers[1:]] + [len(transcript)]

    segments = []
    for header, text_end in zip(headers, text_ends):
        speaker, timestamp = header.groups()
        text = transcript[header.end():text_end].strip()
        segments.append((speaker, timestamp, text))

    return segments
18
 
19
def find_matching_segments(auto_segments, human_segments):
    """
    Find matching segments between auto and human transcripts.

    Matching is based purely on speaker order: the n-th human segment of a
    given speaker is paired with the n-th auto segment of that same speaker.
    Segment text and timestamps are not compared.

    Args:
        auto_segments: Sequence of (speaker, timestamp, text) tuples from the
            auto-generated transcript.
        human_segments: Sequence of (speaker, timestamp, text) tuples from the
            human-edited transcript.

    Returns:
        A dictionary mapping human segment index to auto segment index.
        Human segments whose speaker has no remaining auto segment are left
        out of the mapping.
    """
    from collections import defaultdict, deque

    # Queue the auto segment indices per speaker, in document order.
    # A deque gives O(1) removal from the front (list.pop(0) is O(n)).
    pending_by_speaker = defaultdict(deque)
    for a_idx, (speaker, _, _) in enumerate(auto_segments):
        pending_by_speaker[speaker].append(a_idx)

    matches = {}
    for h_idx, (speaker, _, _) in enumerate(human_segments):
        # .get() avoids creating empty queues for speakers that only appear
        # in the human transcript.
        pending = pending_by_speaker.get(speaker)
        if pending:
            matches[h_idx] = pending.popleft()

    return matches
43
 
44
  def update_timestamps(human_transcript, auto_transcript):
45
  """
46
  Update timestamps in human transcript using timestamps from auto transcript.
47
- Preserves all human edits and formatting.
48
  """
49
  # Extract segments from both transcripts
50
  human_segments = extract_segments(human_transcript)
@@ -53,60 +89,79 @@ def update_timestamps(human_transcript, auto_transcript):
53
  if not human_segments or not auto_segments:
54
  return "Error: Could not parse transcripts. Check formatting.", ""
55
 
56
- # Find matching segments
57
- matches = find_matching_segments(auto_segments, human_segments)
58
 
59
- # Create updated transcript
60
  updated_transcript = human_transcript
61
 
62
  # Replace timestamps in reverse order to avoid position shifts
63
  for h_idx in sorted(matches.keys(), reverse=True):
64
  a_idx = matches[h_idx]
65
 
66
- h_speaker, h_timestamp, _ = human_segments[h_idx]
67
- _, a_timestamp, _ = auto_segments[a_idx]
68
 
69
  # Determine if markdown is used
70
  is_markdown = "**" in human_transcript
71
 
72
- # Create patterns to match the timestamp in the original text
73
  if is_markdown:
74
- # For markdown format: **Speaker** *00:00:00*
75
- pattern = fr"\*\*{h_speaker}\*\*\s+\*{h_timestamp}\*"
76
- replacement = f"**{h_speaker}** *{a_timestamp}*"
77
  else:
78
- # For plain format: Speaker 00:00:00
79
- pattern = fr"{h_speaker}\s+{h_timestamp}"
80
- replacement = f"{h_speaker} {a_timestamp}"
81
 
82
  # Replace the timestamp in the transcript
83
  updated_transcript = re.sub(pattern, replacement, updated_transcript, 1)
84
 
85
  # Generate report
 
 
 
 
86
  report = f"### Timestamp Update Report\n\n"
87
- report += f"- Human segments: {len(human_segments)}\n"
88
- report += f"- Auto segments: {len(auto_segments)}\n"
89
- report += f"- Updated timestamps: {len(matches)}\n"
 
 
 
90
 
91
- if len(matches) < len(human_segments):
92
- unmatched = len(human_segments) - len(matches)
93
- report += f"- Segments not updated: {unmatched}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  return updated_transcript, report
96
 
97
  # Create Gradio interface
98
- with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
99
  gr.Markdown("""
100
- # 🎙️ Simple Transcript Timestamp Updater
101
 
102
- This tool updates timestamps in a human-edited transcript based on an auto-generated transcript.
103
 
104
  ## Instructions:
105
  1. Paste your auto-generated transcript (with correct timestamps)
106
- 2. Paste your human-edited transcript (with old timestamps)
107
  3. Click "Update Timestamps"
108
 
109
- The tool will update only the timestamps while preserving all human edits.
110
  """)
111
 
112
  with gr.Row():
@@ -119,7 +174,7 @@ with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
119
 
120
  with gr.Column():
121
  human_transcript = gr.Textbox(
122
- label="Human-Edited Transcript (with old timestamps)",
123
  placeholder="Paste your human-edited transcript here...",
124
  lines=15
125
  )
@@ -136,7 +191,7 @@ with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
136
 
137
  with gr.TabItem("Report"):
138
  report = gr.Markdown(
139
- label="Report",
140
  value="Report will appear here..."
141
  )
142
 
 
1
  import gradio as gr
2
  import re
3
+ import difflib
4
+ from typing import List, Dict, Tuple, Optional
5
+ from dataclasses import dataclass
6
+
7
@dataclass
class Segment:
    """A segment of a transcript with speaker, timestamp, and text"""
    speaker: str    # Speaker name as written in the header, markdown markers removed
    timestamp: str  # Timestamp string exactly as written in the header
    text: str       # Segment body with surrounding whitespace stripped
    index: int      # Position in the original list


# A header must fill an entire line and be followed by a blank line, so that
# ordinary sentences are not parsed as headers. The timestamp needs at least
# one colon, which keeps a bare number at the end of a paragraph
# ("... in 2020") from matching.
_SEGMENT_HEADER_RE = re.compile(
    r"^(?:\*\*)?([A-Za-z][A-Za-z0-9.'\- ]*?)(?:\*\*)?[ \t]+"
    r"\*?(\d+(?::\d+)+)\*?[ \t\r]*(?=\n[ \t\r]*\n)",
    re.MULTILINE,
)


def extract_segments(transcript: str):
    """
    Extract segments from a transcript.

    Works with both header formats:
    - Speaker LastName 00:00:00
    - **Speaker LastName** *00:00:00*

    The segment text is everything between one header and the next header
    (or the end of the transcript), so multi-paragraph turns are kept whole.

    Args:
        transcript: Full transcript text.

    Returns:
        A list of Segment objects in document order; ``index`` is the
        segment's position in that list. The list is empty when no header
        is found.
    """
    headers = list(_SEGMENT_HEADER_RE.finditer(transcript))
    # Each segment's text ends where the next header begins; the last one
    # runs to the end of the transcript.
    text_ends = [header.start() for header in headers[1:]] + [len(transcript)]

    segments = []
    for i, (header, text_end) in enumerate(zip(headers, text_ends)):
        speaker, timestamp = header.groups()
        text = transcript[header.end():text_end].strip()
        segments.append(Segment(speaker, timestamp, text, i))

    return segments
31
 
32
def clean_text_for_matching(text):
    """Normalise segment text so two transcripts can be compared.

    Applies, in order: markdown links are reduced to their link text,
    asterisks used for markdown emphasis are removed, a fixed set of
    punctuation characters is replaced by spaces, and runs of whitespace are
    collapsed to a single space. The result is lower-cased and stripped.
    """
    # (pattern, replacement) pairs; the order matters because links must be
    # unwrapped before their brackets are treated as punctuation.
    substitutions = (
        (r'\[([^\]]+)\]\([^)]+\)', r'\1'),   # [label](url) -> label
        (r'\*\*|\*', ''),                     # bold / italic markers
        (r'[,.;:!?()[\]{}]', ' '),            # punctuation -> space
        (r'\s+', ' '),                        # collapse whitespace
    )

    normalized = text
    for regex, replacement in substitutions:
        normalized = re.sub(regex, replacement, normalized)

    return normalized.lower().strip()
45
+
46
def find_best_matches(auto_segments, human_segments, min_similarity=0.6):
    """
    Find the best matching segments between auto and human transcripts.

    Uses text similarity (difflib.SequenceMatcher ratio on cleaned text) to
    match segments. Human segments are processed in order and each one
    greedily claims the still-unclaimed auto segment with the highest
    similarity; an auto segment is used at most once. On a tie the auto
    segment with the lowest index wins.

    Args:
        auto_segments: Segments of the auto-generated transcript; only the
            ``text`` attribute of each segment is read.
        human_segments: Segments of the human-edited transcript; only the
            ``text`` attribute of each segment is read.
        min_similarity: A candidate is accepted only when its similarity is
            strictly greater than this value.

    Returns:
        A dictionary mapping human segment index to auto segment index.
        Human segments without a sufficiently similar auto segment are left
        out of the mapping.
    """
    matches = {}
    # Auto indices already claimed; a set keeps the membership test O(1)
    # instead of scanning matches.values() for every candidate.
    claimed = set()

    # Prepare cleaned texts for comparison
    auto_cleaned_texts = [clean_text_for_matching(seg.text) for seg in auto_segments]
    human_cleaned_texts = [clean_text_for_matching(seg.text) for seg in human_segments]

    # For each human segment, find the best matching auto segment
    for h_idx, h_text in enumerate(human_cleaned_texts):
        best_match = None
        best_score = min_similarity

        for a_idx, a_text in enumerate(auto_cleaned_texts):
            if a_idx in claimed:
                continue

            matcher = difflib.SequenceMatcher(None, h_text, a_text)

            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); if a bound cannot beat the current best, the exact
            # (expensive) ratio cannot either, so the result is unchanged.
            if matcher.real_quick_ratio() <= best_score:
                continue
            if matcher.quick_ratio() <= best_score:
                continue

            similarity = matcher.ratio()
            if similarity > best_score:
                best_score = similarity
                best_match = a_idx

        if best_match is not None:
            matches[h_idx] = best_match
            claimed.add(best_match)

    return matches
80
 
81
  def update_timestamps(human_transcript, auto_transcript):
82
  """
83
  Update timestamps in human transcript using timestamps from auto transcript.
 
84
  """
85
  # Extract segments from both transcripts
86
  human_segments = extract_segments(human_transcript)
 
89
  if not human_segments or not auto_segments:
90
  return "Error: Could not parse transcripts. Check formatting.", ""
91
 
92
+ # Find matching segments based on text similarity
93
+ matches = find_best_matches(auto_segments, human_segments)
94
 
95
+ # Create updated transcript with new timestamps
96
  updated_transcript = human_transcript
97
 
98
  # Replace timestamps in reverse order to avoid position shifts
99
  for h_idx in sorted(matches.keys(), reverse=True):
100
  a_idx = matches[h_idx]
101
 
102
+ human_seg = human_segments[h_idx]
103
+ auto_seg = auto_segments[a_idx]
104
 
105
  # Determine if markdown is used
106
  is_markdown = "**" in human_transcript
107
 
108
+ # Create regex patterns to match the timestamp in the original text
109
  if is_markdown:
110
+ pattern = fr"\*\*{human_seg.speaker}\*\*\s+\*{human_seg.timestamp}\*"
111
+ replacement = f"**{human_seg.speaker}** *{auto_seg.timestamp}*"
 
112
  else:
113
+ pattern = fr"{human_seg.speaker}\s+{human_seg.timestamp}"
114
+ replacement = f"{human_seg.speaker} {auto_seg.timestamp}"
 
115
 
116
  # Replace the timestamp in the transcript
117
  updated_transcript = re.sub(pattern, replacement, updated_transcript, 1)
118
 
119
  # Generate report
120
+ match_count = len(matches)
121
+ human_count = len(human_segments)
122
+ auto_count = len(auto_segments)
123
+
124
  report = f"### Timestamp Update Report\n\n"
125
+ report += f"- Human segments: {human_count}\n"
126
+ report += f"- Auto segments: {auto_count}\n"
127
+ report += f"- Matched segments with updated timestamps: {match_count} ({match_count/human_count*100:.1f}%)\n"
128
+
129
+ if match_count < human_count:
130
+ report += f"- Segments not updated: {human_count - match_count}\n"
131
 
132
+ # Print some example matches for verification
133
+ if matches:
134
+ report += "\n### Example matches (for verification):\n\n"
135
+
136
+ # Show up to 5 matches
137
+ sample_matches = list(matches.items())[:5]
138
+ for h_idx, a_idx in sample_matches:
139
+ h_seg = human_segments[h_idx]
140
+ a_seg = auto_segments[a_idx]
141
+
142
+ # Truncate text samples for readability
143
+ h_preview = h_seg.text[:50] + "..." if len(h_seg.text) > 50 else h_seg.text
144
+ a_preview = a_seg.text[:50] + "..." if len(a_seg.text) > 50 else a_seg.text
145
+
146
+ report += f"- {h_seg.speaker}: timestamp changed from `{h_seg.timestamp}` to `{a_seg.timestamp}`\n"
147
+ report += f" - Human: \"{h_preview}\"\n"
148
+ report += f" - Auto: \"{a_preview}\"\n\n"
149
 
150
  return updated_transcript, report
151
 
152
  # Create Gradio interface
153
+ with gr.Blocks(title="Transcript Timestamp Updater") as demo:
154
  gr.Markdown("""
155
+ # 🎙️ Transcript Timestamp Updater
156
 
157
+ This tool updates timestamps in a human-edited transcript by taking correct timestamps from an auto-generated transcript.
158
 
159
  ## Instructions:
160
  1. Paste your auto-generated transcript (with correct timestamps)
161
+ 2. Paste your human-edited transcript (with old timestamps that need updating)
162
  3. Click "Update Timestamps"
163
 
164
+ The tool will preserve all human edits and only update the timestamps.
165
  """)
166
 
167
  with gr.Row():
 
174
 
175
  with gr.Column():
176
  human_transcript = gr.Textbox(
177
+ label="Human-Edited Transcript (timestamps need updating)",
178
  placeholder="Paste your human-edited transcript here...",
179
  lines=15
180
  )
 
191
 
192
  with gr.TabItem("Report"):
193
  report = gr.Markdown(
194
+ label="Matching Report",
195
  value="Report will appear here..."
196
  )
197