dwarkesh committed (verified)
Commit 214a4d6 · 1 Parent(s): 4322c44

Update app.py

Files changed (1): app.py (+90 -139)
app.py CHANGED
--- app.py (before)
@@ -2,7 +2,6 @@ import gradio as gr
 import re
 import difflib
 from typing import List, Dict, Tuple, Optional
-import numpy as np
 from dataclasses import dataclass

 @dataclass
@@ -13,31 +12,10 @@ class Segment:
     text: str
     raw_text: str # For matching purposes - original text without formatting

-@dataclass
-class Match:
-    """Represents a match between segments"""
-    auto_index: int
-    human_index: int
-    similarity: float
-
-def parse_auto_transcript(transcript: str) -> List[Segment]:
-    """Parse the auto-generated transcript"""
-    # Pattern to match "Speaker X 00:00:00" followed by text
-    pattern = r"(?:\*\*)?Speaker (\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?Speaker |\Z)"
-    segments = []
-
-    for match in re.finditer(pattern, transcript, re.DOTALL):
-        speaker, timestamp, text = match.groups()
-        # Remove any markdown formatting for matching purposes
-        raw_text = re.sub(r'\*\*|\*', '', text.strip())
-        segments.append(Segment(speaker, timestamp, text.strip(), raw_text))
-
-    return segments
-
-def parse_human_transcript(transcript: str) -> List[Segment]:
-    """Parse the human-edited transcript"""
-    # Pattern to match both markdown and plain text formats
-    # This handles both "**Speaker X** *00:00:00*" and "Speaker X 00:00:00"
     pattern = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker )?|\Z)"
     segments = []

@@ -49,186 +27,165 @@ def parse_human_transcript(transcript: str) -> List[Segment]:

     return segments

-def similarity_score(text1: str, text2: str) -> float:
-    """Calculate similarity between two text segments"""
     # Remove all markdown, punctuation, and lowercase for better matching
-    clean1 = re.sub(r'[^\w\s]', '', text1.lower())
-    clean2 = re.sub(r'[^\w\s]', '', text2.lower())
-
-    # Use difflib's SequenceMatcher for similarity
-    return difflib.SequenceMatcher(None, clean1, clean2).ratio()

-def find_best_matches(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Match]:
-    """Find the best matching segments between auto and human transcripts"""
     matches = []
-    used_human_indices = set()

-    # First pass: Find obvious matches (high similarity)
-    for auto_idx, auto_segment in enumerate(auto_segments):
-        best_match_idx = -1
-        best_similarity = 0.0
-
-        for human_idx, human_segment in enumerate(human_segments):
-            if human_idx in used_human_indices:
-                continue
-
-            similarity = similarity_score(auto_segment.raw_text, human_segment.raw_text)
-
-            if similarity > best_similarity and similarity >= 0.6: # Threshold for a good match
-                best_similarity = similarity
-                best_match_idx = human_idx
-
-        if best_match_idx >= 0:
-            matches.append(Match(auto_idx, best_match_idx, best_similarity))
-            used_human_indices.add(best_match_idx)

-    # Second pass: Try to match remaining segments with a lower threshold
-    for auto_idx, auto_segment in enumerate(auto_segments):
-        if any(m.auto_index == auto_idx for m in matches):
-            continue
-
         best_match_idx = -1
-        best_similarity = 0.0

-        for human_idx, human_segment in enumerate(human_segments):
-            if human_idx in used_human_indices:
                 continue

-            similarity = similarity_score(auto_segment.raw_text, human_segment.raw_text)

-            if similarity > best_similarity and similarity >= 0.4: # Lower threshold
                 best_similarity = similarity
-                best_match_idx = human_idx

         if best_match_idx >= 0:
-            matches.append(Match(auto_idx, best_match_idx, best_similarity))
-            used_human_indices.add(best_match_idx)

     return matches

-def update_timestamps(auto_segments: List[Segment], human_segments: List[Segment], matches: List[Match]) -> str:
     """Update timestamps in human transcript based on matches"""
-    # Create a new list for the updated segments
     updated_segments = human_segments.copy()

-    for match in matches:
-        auto_segment = auto_segments[match.auto_index]
-        human_segment = human_segments[match.human_index]
-
-        # Update the timestamp in the human segment
-        updated_segments[match.human_index] = Segment(
-            speaker=human_segment.speaker,
-            timestamp=auto_segment.timestamp,
-            text=human_segment.text,
-            raw_text=human_segment.raw_text
         )

     # Generate the updated transcript
     result = []
     for segment in updated_segments:
-        # Check if this is a markdown-formatted transcript
-        if "**" in human_segments[0].text or "*" in human_segments[0].timestamp:
             result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
         else:
             result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

     return "\n\n".join(result)

-def find_unmatched_segments(auto_segments: List[Segment], matches: List[Match]) -> List[int]:
-    """Find segments in the auto transcript that weren't matched"""
-    matched_auto_indices = {match.auto_index for match in matches}
     return [i for i in range(len(auto_segments)) if i not in matched_auto_indices]

-def format_unmatched_segments(auto_segments: List[Segment], unmatched_indices: List[int], is_markdown: bool) -> str:
-    """Format unmatched segments for display"""
-    if not unmatched_indices:
-        return "No unmatched segments found"

     result = []
-    for idx in unmatched_indices:
-        segment = auto_segments[idx]
         if is_markdown:
-            result.append(f"**Speaker {segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
         else:
             result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

-    return "### Unmatched Segments (New Content)\n\n" + "\n\n".join(result)

 def process_transcripts(auto_transcript: str, human_transcript: str):
     """Process transcripts and update timestamps"""
-    # Parse both transcripts
-    auto_segments = parse_auto_transcript(auto_transcript)
-    human_segments = parse_human_transcript(human_transcript)

-    # Early check for empty inputs
     if not auto_segments or not human_segments:
-        return "Error: Could not parse one or both transcripts. Please check the format.", "", ""
-
-    # Find matches between segments
-    matches = find_best_matches(auto_segments, human_segments)

     # Find unmatched segments
-    unmatched_indices = find_unmatched_segments(auto_segments, matches)

-    # Determine if we're using markdown
     is_markdown = "**" in human_transcript or "*" in human_transcript

     # Update timestamps
     updated_transcript = update_timestamps(auto_segments, human_segments, matches)

-    # Format unmatched segments
-    unmatched_segments = format_unmatched_segments(auto_segments, unmatched_indices, is_markdown)
-
-    # Stats about the matching
     stats = f"### Matching Statistics\n\n"
     stats += f"- Auto-generated segments: {len(auto_segments)}\n"
     stats += f"- Human-edited segments: {len(human_segments)}\n"
     stats += f"- Matched segments: {len(matches)}\n"
-    stats += f"- Unmatched segments: {len(unmatched_indices)}\n"

-    # Add match quality histogram
-    if matches:
-        similarities = [match.similarity for match in matches]
-        stats += f"- Average match similarity: {sum(similarities)/len(similarities):.2f}\n"
-
-        # Histogram of match qualities
-        bins = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
-        hist, _ = np.histogram(similarities, bins=bins)
-        stats += "\n#### Match Quality Distribution\n\n"
-        for i, count in enumerate(hist):
-            lower = bins[i]
-            upper = bins[i+1]
-            stats += f"- {lower:.1f}-{upper:.1f}: {count} matches\n"

-    return updated_transcript, unmatched_segments, stats

 # Create Gradio interface
 with gr.Blocks(title="Transcript Timestamp Updater") as demo:
     gr.Markdown("""
-    # Transcript Timestamp Updater

-    This tool updates timestamps in a human-edited transcript based on a new auto-generated transcript.

     ## Instructions:
     1. Paste your new auto-generated transcript (with updated timestamps)
     2. Paste your human-edited transcript (with old timestamps)
-    3. Click "Update Timestamps" to generate a new version of the human-edited transcript with updated timestamps

-    The tool will try to match segments between the two transcripts and update the timestamps accordingly.
     """)

     with gr.Row():
         with gr.Column():
-            auto_transcript = gr.Textbox(
-                label="New Auto-Generated Transcript (with updated timestamps)",
-                placeholder="Paste the new auto-generated transcript here...",
                 lines=15
             )

         with gr.Column():
-            human_transcript = gr.Textbox(
                 label="Human-Edited Transcript (with old timestamps)",
-                placeholder="Paste your human-edited transcript here...",
                 lines=15
             )

@@ -237,27 +194,21 @@ with gr.Blocks(title="Transcript Timestamp Updater") as demo:
     with gr.Tabs():
         with gr.TabItem("Updated Transcript"):
             updated_transcript = gr.TextArea(
-                label="Updated Human Transcript",
                 placeholder="The updated transcript will appear here...",
                 lines=20
             )

-        with gr.TabItem("Unmatched Segments"):
-            unmatched_segments = gr.Markdown(
-                label="Unmatched Segments",
-                value="Unmatched segments will appear here..."
-            )
-
         with gr.TabItem("Statistics"):
             stats = gr.Markdown(
-                label="Matching Statistics",
                 value="Statistics will appear here..."
             )

     update_btn.click(
         fn=process_transcripts,
         inputs=[auto_transcript, human_transcript],
-        outputs=[updated_transcript, unmatched_segments, stats]
     )

 # Launch the app

+++ app.py (after)
 import re
 import difflib
 from typing import List, Dict, Tuple, Optional
 from dataclasses import dataclass

 @dataclass

     text: str
     raw_text: str # For matching purposes - original text without formatting

+def parse_transcript(transcript: str) -> List[Segment]:
+    """Parse a transcript into segments, handling both markdown and plain formats"""
+    # This pattern matches both markdown and plain text formats:
+    # - "**Speaker X** *00:00:00*" or "Speaker X 00:00:00"
     pattern = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker )?|\Z)"
     segments = []

     return segments

+def clean_text_for_comparison(text: str) -> str:
+    """Clean text for better comparison"""
     # Remove all markdown, punctuation, and lowercase for better matching
+    text = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text)
+    text = re.sub(r'[^\w\s]', '', text.lower())
+    return text.strip()

+def match_segments(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Tuple[int, int]]:
+    """Match segments between auto and human transcripts using text similarity
+    Returns list of tuples (auto_index, human_index)"""
     matches = []

+    # Prepare clean versions of texts for comparison
+    auto_texts = [clean_text_for_comparison(seg.raw_text) for seg in auto_segments]
+    human_texts = [clean_text_for_comparison(seg.raw_text) for seg in human_segments]

+    # Try to match each human segment to an auto segment
+    for human_idx, human_text in enumerate(human_texts):
         best_match_idx = -1
+        best_similarity = 0

+        for auto_idx, auto_text in enumerate(auto_texts):
+            # Skip if this auto segment is already matched
+            if any(match[0] == auto_idx for match in matches):
                 continue

+            # Calculate similarity
+            similarity = difflib.SequenceMatcher(None, auto_text, human_text).ratio()

+            if similarity > best_similarity and similarity >= 0.6: # Threshold
                 best_similarity = similarity
+                best_match_idx = auto_idx

         if best_match_idx >= 0:
+            matches.append((best_match_idx, human_idx))

     return matches

+def update_timestamps(auto_segments: List[Segment], human_segments: List[Segment], matches: List[Tuple[int, int]]) -> str:
     """Update timestamps in human transcript based on matches"""
     updated_segments = human_segments.copy()

+    # Update timestamps based on matches
+    for auto_idx, human_idx in matches:
+        # Keep the human-edited text, update only the timestamp
+        updated_segments[human_idx] = Segment(
+            speaker=human_segments[human_idx].speaker,
+            timestamp=auto_segments[auto_idx].timestamp,
+            text=human_segments[human_idx].text,
+            raw_text=human_segments[human_idx].raw_text
         )

+    # Determine if the human transcript uses markdown formatting
+    is_markdown = "**" in human_segments[0].text or "*" in human_segments[0].timestamp if human_segments else False
+
     # Generate the updated transcript
     result = []
     for segment in updated_segments:
+        if is_markdown:
             result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
         else:
             result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

     return "\n\n".join(result)

+def get_unmatched_auto_segments(auto_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
+    """Get indices of auto segments that weren't matched to any human segment"""
+    matched_auto_indices = {match[0] for match in matches}
     return [i for i in range(len(auto_segments)) if i not in matched_auto_indices]

+def get_unmatched_human_segments(human_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
+    """Get indices of human segments that weren't matched to any auto segment"""
+    matched_human_indices = {match[1] for match in matches}
+    return [i for i in range(len(human_segments)) if i not in matched_human_indices]
+
+def format_segments(segments: List[Segment], indices: List[int], is_markdown: bool) -> str:
+    """Format segments for display"""
+    if not indices:
+        return "None"

     result = []
+    for idx in indices:
+        segment = segments[idx]
         if is_markdown:
+            result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
         else:
             result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

+    return "\n\n".join(result)

 def process_transcripts(auto_transcript: str, human_transcript: str):
     """Process transcripts and update timestamps"""
+    # Parse transcripts
+    auto_segments = parse_transcript(auto_transcript)
+    human_segments = parse_transcript(human_transcript)

+    # Basic validation
     if not auto_segments or not human_segments:
+        return "Error: Could not parse transcripts. Check formatting.", "", ""
+
+    # Match segments
+    matches = match_segments(auto_segments, human_segments)

     # Find unmatched segments
+    unmatched_auto = get_unmatched_auto_segments(auto_segments, matches)
+    unmatched_human = get_unmatched_human_segments(human_segments, matches)

+    # Determine if the format uses markdown
     is_markdown = "**" in human_transcript or "*" in human_transcript

     # Update timestamps
     updated_transcript = update_timestamps(auto_segments, human_segments, matches)

+    # Format statistics
     stats = f"### Matching Statistics\n\n"
     stats += f"- Auto-generated segments: {len(auto_segments)}\n"
     stats += f"- Human-edited segments: {len(human_segments)}\n"
     stats += f"- Matched segments: {len(matches)}\n"
+    stats += f"- Unmatched auto segments (new content): {len(unmatched_auto)}\n"
+    stats += f"- Unmatched human segments (removed content): {len(unmatched_human)}\n"

+    # Format unmatched segments
+    if unmatched_auto:
+        stats += f"\n### New Content (In Auto-generated but not in Human-edited)\n\n"
+        stats += format_segments(auto_segments, unmatched_auto, is_markdown)
+
+    if unmatched_human:
+        stats += f"\n### Removed Content (In Human-edited but not in Auto-generated)\n\n"
+        stats += format_segments(human_segments, unmatched_human, is_markdown)

+    return updated_transcript, stats

 # Create Gradio interface
 with gr.Blocks(title="Transcript Timestamp Updater") as demo:
     gr.Markdown("""
+    # 🎙️ Transcript Timestamp Updater

+    This tool updates timestamps in human-edited transcripts based on auto-generated transcripts.

     ## Instructions:
     1. Paste your new auto-generated transcript (with updated timestamps)
     2. Paste your human-edited transcript (with old timestamps)
+    3. Click "Update Timestamps"

+    The tool will match segments between transcripts and update the timestamps while preserving all human edits.
     """)

     with gr.Row():
         with gr.Column():
+            auto_transcript = gr.TextArea(
+                label="Auto-Generated Transcript (with new timestamps)",
+                placeholder="Paste the auto-generated transcript here...",
                 lines=15
             )

         with gr.Column():
+            human_transcript = gr.TextArea(
                 label="Human-Edited Transcript (with old timestamps)",
+                placeholder="Paste the human-edited transcript here...",
                 lines=15
             )

     with gr.Tabs():
         with gr.TabItem("Updated Transcript"):
             updated_transcript = gr.TextArea(
+                label="Updated Transcript",
                 placeholder="The updated transcript will appear here...",
                 lines=20
             )

         with gr.TabItem("Statistics"):
             stats = gr.Markdown(
+                label="Statistics",
                 value="Statistics will appear here..."
             )

     update_btn.click(
         fn=process_transcripts,
         inputs=[auto_transcript, human_transcript],
+        outputs=[updated_transcript, stats]
     )

 # Launch the app
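
For reference, a minimal sketch (not part of the commit) of the segment format the unified pattern in the new parse_transcript is built to handle. The two-segment sample transcript and speaker names below are invented for illustration; the regex itself is taken verbatim from the diff above.

import re

pattern = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker )?|\Z)"

# One markdown-style header and one plain header, both accepted by the same pattern.
sample = (
    "**Speaker A** *00:00:05*\n\n"
    "Hello and welcome to the show.\n\n"
    "Speaker B 00:00:12\n\n"
    "Thanks for having me."
)

for speaker, timestamp, text in re.findall(pattern, sample, re.DOTALL):
    print(speaker, timestamp, text.strip())
# A 00:00:05 Hello and welcome to the show.
# B 00:00:12 Thanks for having me.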
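
And a second small sketch of the similarity scoring that match_segments relies on: clean_text_for_comparison (copied from the new code) strips markdown, links, and punctuation before difflib compares, and 0.6 is the matching threshold used in the diff. The two segment texts are invented examples.

import difflib
import re

def clean_text_for_comparison(text: str) -> str:
    # Same cleaning steps as the new app.py: drop markdown/links, then punctuation, lowercase.
    text = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text.strip()

auto_text = "So, I think the **main point** is scaling laws."
human_text = "I think the main point is scaling laws."

ratio = difflib.SequenceMatcher(
    None,
    clean_text_for_comparison(auto_text),
    clean_text_for_comparison(human_text),
).ratio()
print(f"{ratio:.2f}")  # well above the 0.6 threshold, so these segments would be paired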