dwarkesh committed on
Commit
fb1eceb
·
verified ·
1 Parent(s): 214a4d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +250 -153
app.py CHANGED
@@ -1,195 +1,248 @@
1
  import gradio as gr
2
  import re
3
  import difflib
 
4
  from typing import List, Dict, Tuple, Optional
5
  from dataclasses import dataclass
 
6
 
7
  @dataclass
8
  class Segment:
9
- """Represents a transcript segment"""
10
  speaker: str
11
  timestamp: str
12
  text: str
13
- raw_text: str # For matching purposes - original text without formatting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def parse_transcript(transcript: str) -> List[Segment]:
16
- """Parse a transcript into segments, handling both markdown and plain formats"""
17
- # This pattern matches both markdown and plain text formats:
18
- # - "**Speaker X** *00:00:00*" or "Speaker X 00:00:00"
19
- pattern = r"(?:\*\*)?(?:Speaker )?(\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker )?|\Z)"
20
- segments = []
 
 
 
21
 
22
- for match in re.finditer(pattern, transcript, re.DOTALL):
 
23
  speaker, timestamp, text = match.groups()
24
- # Remove any markdown formatting for matching purposes
25
- raw_text = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text.strip())
26
- segments.append(Segment(speaker, timestamp, text.strip(), raw_text))
27
 
28
  return segments
29
 
30
- def clean_text_for_comparison(text: str) -> str:
31
- """Clean text for better comparison"""
32
- # Remove all markdown, punctuation, and lowercase for better matching
33
- text = re.sub(r'\*\*|\*|\[.*?\]\(.*?\)', '', text)
34
- text = re.sub(r'[^\w\s]', '', text.lower())
35
- return text.strip()
36
-
37
- def match_segments(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Tuple[int, int]]:
38
- """Match segments between auto and human transcripts using text similarity
39
- Returns list of tuples (auto_index, human_index)"""
40
- matches = []
41
-
42
- # Prepare clean versions of texts for comparison
43
- auto_texts = [clean_text_for_comparison(seg.raw_text) for seg in auto_segments]
44
- human_texts = [clean_text_for_comparison(seg.raw_text) for seg in human_segments]
45
-
46
- # Try to match each human segment to an auto segment
47
- for human_idx, human_text in enumerate(human_texts):
48
- best_match_idx = -1
49
- best_similarity = 0
 
 
50
 
51
- for auto_idx, auto_text in enumerate(auto_texts):
52
- # Skip if this auto segment is already matched
53
- if any(match[0] == auto_idx for match in matches):
54
- continue
55
-
56
- # Calculate similarity
57
- similarity = difflib.SequenceMatcher(None, auto_text, human_text).ratio()
58
 
59
- if similarity > best_similarity and similarity >= 0.6: # Threshold
60
- best_similarity = similarity
61
- best_match_idx = auto_idx
 
 
62
 
63
- if best_match_idx >= 0:
64
- matches.append((best_match_idx, human_idx))
 
65
 
66
- return matches
67
 
68
- def update_timestamps(auto_segments: List[Segment], human_segments: List[Segment], matches: List[Tuple[int, int]]) -> str:
69
- """Update timestamps in human transcript based on matches"""
70
- updated_segments = human_segments.copy()
71
-
72
- # Update timestamps based on matches
73
- for auto_idx, human_idx in matches:
74
- # Keep the human-edited text, update only the timestamp
75
- updated_segments[human_idx] = Segment(
76
- speaker=human_segments[human_idx].speaker,
77
- timestamp=auto_segments[auto_idx].timestamp,
78
- text=human_segments[human_idx].text,
79
- raw_text=human_segments[human_idx].raw_text
80
- )
81
-
82
- # Determine if the human transcript uses markdown formatting
83
- is_markdown = "**" in human_segments[0].text or "*" in human_segments[0].timestamp if human_segments else False
84
-
85
- # Generate the updated transcript
86
- result = []
87
- for segment in updated_segments:
88
- if is_markdown:
89
- result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
90
  else:
91
- result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")
 
 
 
 
92
 
93
- return "\n\n".join(result)
94
-
95
- def get_unmatched_auto_segments(auto_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
96
- """Get indices of auto segments that weren't matched to any human segment"""
97
- matched_auto_indices = {match[0] for match in matches}
98
- return [i for i in range(len(auto_segments)) if i not in matched_auto_indices]
99
 
100
- def get_unmatched_human_segments(human_segments: List[Segment], matches: List[Tuple[int, int]]) -> List[int]:
101
- """Get indices of human segments that weren't matched to any auto segment"""
102
- matched_human_indices = {match[1] for match in matches}
103
- return [i for i in range(len(human_segments)) if i not in matched_human_indices]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- def format_segments(segments: List[Segment], indices: List[int], is_markdown: bool) -> str:
106
- """Format segments for display"""
107
- if not indices:
108
- return "None"
109
-
110
- result = []
111
- for idx in indices:
112
- segment = segments[idx]
113
- if is_markdown:
114
- result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
115
- else:
116
- result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- return "\n\n".join(result)
 
119
 
120
- def process_transcripts(auto_transcript: str, human_transcript: str):
121
- """Process transcripts and update timestamps"""
122
- # Parse transcripts
123
- auto_segments = parse_transcript(auto_transcript)
124
- human_segments = parse_transcript(human_transcript)
125
-
126
- # Basic validation
127
- if not auto_segments or not human_segments:
128
- return "Error: Could not parse transcripts. Check formatting.", "", ""
129
-
130
- # Match segments
131
- matches = match_segments(auto_segments, human_segments)
132
-
133
- # Find unmatched segments
134
- unmatched_auto = get_unmatched_auto_segments(auto_segments, matches)
135
- unmatched_human = get_unmatched_human_segments(human_segments, matches)
136
-
137
- # Determine if the format uses markdown
138
- is_markdown = "**" in human_transcript or "*" in human_transcript
139
-
140
- # Update timestamps
141
- updated_transcript = update_timestamps(auto_segments, human_segments, matches)
142
-
143
- # Format statistics
144
- stats = f"### Matching Statistics\n\n"
145
- stats += f"- Auto-generated segments: {len(auto_segments)}\n"
146
- stats += f"- Human-edited segments: {len(human_segments)}\n"
147
- stats += f"- Matched segments: {len(matches)}\n"
148
- stats += f"- Unmatched auto segments (new content): {len(unmatched_auto)}\n"
149
- stats += f"- Unmatched human segments (removed content): {len(unmatched_human)}\n"
150
-
151
- # Format unmatched segments
152
- if unmatched_auto:
153
- stats += f"\n### New Content (In Auto-generated but not in Human-edited)\n\n"
154
- stats += format_segments(auto_segments, unmatched_auto, is_markdown)
155
-
156
- if unmatched_human:
157
- stats += f"\n### Removed Content (In Human-edited but not in Auto-generated)\n\n"
158
- stats += format_segments(human_segments, unmatched_human, is_markdown)
159
-
160
- return updated_transcript, stats
161
 
162
  # Create Gradio interface
163
- with gr.Blocks(title="Transcript Timestamp Updater") as demo:
164
  gr.Markdown("""
165
- # 🎙️ Transcript Timestamp Updater
166
 
167
- This tool updates timestamps in human-edited transcripts based on auto-generated transcripts.
168
 
169
  ## Instructions:
170
- 1. Paste your new auto-generated transcript (with updated timestamps)
171
- 2. Paste your human-edited transcript (with old timestamps)
172
- 3. Click "Update Timestamps"
173
 
174
- The tool will match segments between transcripts and update the timestamps while preserving all human edits.
175
  """)
176
 
177
  with gr.Row():
178
  with gr.Column():
179
- auto_transcript = gr.TextArea(
180
- label="Auto-Generated Transcript (with new timestamps)",
 
 
 
 
 
 
 
 
 
 
181
  placeholder="Paste the auto-generated transcript here...",
182
- lines=15
 
183
  )
184
 
185
  with gr.Column():
186
- human_transcript = gr.TextArea(
187
- label="Human-Edited Transcript (with old timestamps)",
 
 
 
 
 
 
 
 
 
 
188
  placeholder="Paste the human-edited transcript here...",
189
- lines=15
 
190
  )
191
 
192
- update_btn = gr.Button("Update Timestamps")
193
 
194
  with gr.Tabs():
195
  with gr.TabItem("Updated Transcript"):
@@ -198,19 +251,63 @@ with gr.Blocks(title="Transcript Timestamp Updater") as demo:
198
  placeholder="The updated transcript will appear here...",
199
  lines=20
200
  )
 
 
201
 
202
- with gr.TabItem("Statistics"):
203
- stats = gr.Markdown(
204
- label="Statistics",
205
- value="Statistics will appear here..."
206
  )
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  update_btn.click(
209
- fn=process_transcripts,
210
- inputs=[auto_transcript, human_transcript],
211
- outputs=[updated_transcript, stats]
 
 
 
 
 
 
 
 
 
 
 
 
212
  )
213
 
214
- # Launch the app
215
  if __name__ == "__main__":
216
  demo.launch()
 
1
  import gradio as gr
2
  import re
3
  import difflib
4
+ import os
5
  from typing import List, Dict, Tuple, Optional
6
  from dataclasses import dataclass
7
+ import numpy as np
8
 
9
@dataclass
class Segment:
    """One speaker turn parsed from a transcript.

    ``parse_transcript`` builds these. ``text`` holds the normalized form used
    for similarity matching, while ``original_text`` keeps the turn as it
    appeared in the transcript (stripped of surrounding whitespace) so that
    human edits, links and formatting can be written back out untouched.
    """

    speaker: str  # Speaker name captured from the turn header
    timestamp: str  # Timestamp string captured from the turn header
    text: str  # Cleaned text, used only for matching
    original_text: str  # The text as it appears in the original transcript
    index: int  # Position in the original transcript
17
+
18
def clean_text_for_matching(text: str) -> str:
    """Normalize text so two renderings of the same speech compare well.

    Markdown links are replaced by their link text, ``*`` / ``**`` emphasis
    markers are dropped, the punctuation ``, . ; : ! ?`` is turned into
    whitespace, whitespace runs collapse to a single space, and the result is
    lower-cased and stripped. No words are removed.
    """
    # [label](url) -> label
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Drop bold and italic asterisks.
    text = re.sub(r'\*\*|\*', '', text)

    # Punctuation becomes a space (not nothing) so "a,b" stays two words.
    text = re.sub(r'[,.;:!?]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text.lower().strip()
29
+
30
def load_transcript_file(file_path: str) -> str:
    """Return the full contents of the transcript at ``file_path``.

    The file is read as UTF-8 text. Errors from ``open`` or from decoding
    (for example ``FileNotFoundError`` or ``UnicodeDecodeError``) propagate
    to the caller.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()
34
 
35
def parse_transcript(transcript: str) -> List[Segment]:
    """
    Parse transcript into segments.

    Works with both formats:
    - Speaker LastName 00:00:00
    - **Speaker LastName** *00:00:00*

    A segment is a header (speaker word, whitespace, timestamp) followed by a
    blank line and the spoken text. The text runs until the next paragraph
    whose line ends in a header, or until the end of the transcript, so a
    turn may span several paragraphs.

    Returns the segments in document order. ``Segment.text`` is the cleaned
    text used for matching and ``Segment.original_text`` is the stripped
    original text.
    """
    # Header: optional bold markers, optional literal "Speaker", one
    # alphabetic word as the speaker, then a timestamp made of digits/colons
    # (optionally italic), then a blank line.
    header = r"(?:\*\*)?(?:Speaker\s+)?([A-Za-z]+)(?:\*\*)?\s+\*?([0-9:]+)\*?\s*\n\n"
    # Same header without capture groups, used to recognise where the NEXT
    # segment starts.
    next_header = r"(?:\*\*)?(?:Speaker\s+)?[A-Za-z]+(?:\*\*)?\s+\*?[0-9:]+\*?\s*\n\n"

    # The lookahead must see a complete header on the line after the blank
    # line. Stopping at any paragraph that merely starts with a letter would
    # cut every multi-paragraph turn down to its first paragraph.
    # `[^\n]*?` lets the header sit at the end of its line, matching how the
    # (unanchored) header itself is located.
    pattern = re.compile(
        header + r"(.*?)(?=\n\n[^\n]*?" + next_header + r"|\Z)",
        re.DOTALL,
    )

    segments = []
    for i, match in enumerate(pattern.finditer(transcript)):
        speaker, timestamp, text = match.groups()
        original_text = text.strip()
        cleaned_text = clean_text_for_matching(original_text)
        segments.append(Segment(speaker, timestamp, cleaned_text, original_text, i))

    return segments
53
 
54
def align_segments(auto_segments: List["Segment"], human_segments: List["Segment"],
                   threshold: float = 0.5) -> Dict[int, int]:
    """
    Align segments from human-edited transcript to auto-generated transcript.

    Human segments are visited in order. Each one is greedily matched to the
    most similar auto segment that comes after the previously matched auto
    segment, so the alignment always preserves transcript order. Similarity
    is ``difflib.SequenceMatcher(...).ratio()`` on the segments' cleaned
    ``text``.

    Args:
        auto_segments: Segments of the auto-generated transcript.
        human_segments: Segments of the human-edited transcript.
        threshold: A candidate is accepted only if its similarity is
            strictly greater than this value.

    Returns:
        A dictionary mapping human segment indices to auto segment indices.
        Human segments with no acceptable candidate are absent.
    """
    alignments: Dict[int, int] = {}

    # Matches are strictly increasing in the auto index, so "not yet assigned
    # and after every earlier match" is simply "after the last match".
    last_auto_idx = -1

    for h_idx, h_segment in enumerate(human_segments):
        best_match = -1
        best_similarity = threshold

        # Only candidates that can still be chosen are scored. Ascending
        # order plus a strict '>' makes ties resolve to the earliest one.
        for a_idx in range(last_auto_idx + 1, len(auto_segments)):
            similarity = difflib.SequenceMatcher(
                None, h_segment.text, auto_segments[a_idx].text
            ).ratio()
            if similarity > best_similarity:
                best_match = a_idx
                best_similarity = similarity

        if best_match >= 0:
            alignments[h_idx] = best_match
            last_auto_idx = best_match

    return alignments
91
 
92
def update_transcript(human_segments: List["Segment"], auto_segments: List["Segment"],
                      alignments: Dict[int, int], is_markdown: bool) -> str:
    """
    Create updated transcript by transferring timestamps from auto segments to human segments.

    Each human segment keeps its speaker and its ``original_text``. Matched
    segments take the timestamp of their aligned auto segment. Unmatched
    segments keep their own timestamp followed by ``[NO MATCH]``.

    Args:
        human_segments: Segments of the human-edited transcript.
        auto_segments: Segments of the auto-generated transcript.
        alignments: Mapping of human segment index to auto segment index.
        is_markdown: Emit ``**speaker** *timestamp*`` headers when true,
            ``Speaker speaker timestamp`` headers otherwise.

    Returns:
        The segments joined by blank lines.
    """
    def format_header(speaker: str, stamp: str) -> str:
        # Single place that knows the two header layouts.
        if is_markdown:
            return f"**{speaker}** *{stamp}*"
        return f"Speaker {speaker} {stamp}"

    updated_segments = []

    for h_idx, h_segment in enumerate(human_segments):
        a_idx = alignments.get(h_idx)
        if a_idx is not None:
            # Segment was matched, use timestamp from auto segment
            stamp = auto_segments[a_idx].timestamp
        else:
            # No match found, keep original timestamp but mark it
            stamp = f"{h_segment.timestamp} [NO MATCH]"

        header = format_header(h_segment.speaker, stamp)
        updated_segments.append(f"{header}\n\n{h_segment.original_text}")

    return "\n\n".join(updated_segments)
 
 
 
 
 
117
 
118
def generate_match_report(human_segments: List["Segment"], auto_segments: List["Segment"],
                          alignments: Dict[int, int]) -> str:
    """Generate a Markdown report about the matching process.

    The report lists segment counts and the share of human segments that were
    matched, then every unmatched human segment with a preview of up to 50
    characters of its cleaned text, then the average similarity of the
    matched pairs (omitted when nothing matched).

    Args:
        human_segments: Segments of the human-edited transcript.
        auto_segments: Segments of the auto-generated transcript.
        alignments: Mapping of human segment index to auto segment index.

    Returns:
        The report as a Markdown string.
    """
    preview_length = 50

    total_human = len(human_segments)
    total_auto = len(auto_segments)
    total_matched = len(alignments)
    # Guard the percentage: an empty human transcript must not raise.
    matched_pct = total_matched / total_human * 100 if total_human else 0.0

    parts = [
        "### Matching Report\n\n",
        f"- Human segments: {total_human}\n",
        f"- Auto segments: {total_auto}\n",
        f"- Matched segments: {total_matched} ({matched_pct:.1f}%)\n",
    ]

    if total_matched < total_human:
        parts.append(f"\n### Unmatched Segments ({total_human - total_matched})\n\n")
        for h_idx, h_segment in enumerate(human_segments):
            if h_idx in alignments:
                continue
            preview = h_segment.text[:preview_length]
            # Only show an ellipsis when text was actually cut off.
            if len(h_segment.text) > preview_length:
                preview += "..."
            parts.append(
                f"- Speaker {h_segment.speaker} at {h_segment.timestamp}: '{preview}'\n"
            )

    # Calculate average similarity of matches
    if alignments:
        similarities = [
            difflib.SequenceMatcher(
                None, human_segments[h_idx].text, auto_segments[a_idx].text
            ).ratio()
            for h_idx, a_idx in alignments.items()
        ]
        avg_similarity = sum(similarities) / len(similarities)
        parts.append("\n### Match Quality\n\n")
        parts.append(f"- Average similarity: {avg_similarity:.2f}\n")

    return "".join(parts)
149
 
150
def _as_text(value) -> str:
    """Return ``value`` as a string: ``None`` -> "", bytes are decoded as UTF-8."""
    if value is None:
        return ""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _uses_markdown_headers(content: str) -> bool:
    """Tell whether segment headers look like ``**Name** *00:00:00*``.

    Only a bold span directly followed by a timestamp at the end of its line
    counts, so bold text inside a segment body does not flip the format.
    """
    return re.search(r"\*\*[^*\n]+\*\*\s+\*?[0-9:]+\*?\s*\n", content) is not None


def process_transcripts(auto_transcript, human_transcript):
    """Process the auto and human transcripts to update timestamps.

    Args:
        auto_transcript: Auto-generated transcript (str or UTF-8 bytes)
            carrying the new timestamps.
        human_transcript: Human-edited transcript (str or UTF-8 bytes)
            carrying the old timestamps.

    Returns:
        A ``(updated_transcript, report)`` tuple of strings. On any failure
        the first element is an error message and the second is empty; this
        function does not raise.
    """
    try:
        # Load transcripts
        auto_content = _as_text(auto_transcript)
        human_content = _as_text(human_transcript)

        # Missing input gets an actionable message instead of a type error.
        if not auto_content.strip() or not human_content.strip():
            return "Error: Both transcripts are required.", ""

        # Check if transcripts use markdown formatting
        is_markdown = _uses_markdown_headers(human_content)

        # Parse transcripts
        auto_segments = parse_transcript(auto_content)
        human_segments = parse_transcript(human_content)

        if not auto_segments or not human_segments:
            return "Error: Could not parse transcripts. Please check the format.", ""

        # Align segments
        alignments = align_segments(auto_segments, human_segments)

        # Update transcript
        updated_transcript = update_transcript(human_segments, auto_segments, alignments, is_markdown)

        # Generate report
        report = generate_match_report(human_segments, auto_segments, alignments)

        return updated_transcript, report

    except Exception as e:
        # UI boundary: show the failure in the output box rather than crash.
        return f"Error processing transcripts: {str(e)}", ""
180
 
181
def save_transcript(transcript: str) -> str:
    """Write ``transcript`` to ``output/updated_transcript.md`` and return that path.

    The ``output`` directory is created relative to the current working
    directory if needed, and the file is written as UTF-8, replacing any
    previous contents.
    """
    # NOTE(review): the path is fixed, so concurrent callers overwrite each
    # other's file - confirm this is acceptable for the deployment.
    output_dir = "output"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, "updated_transcript.md")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(transcript)

    return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  # Create Gradio interface
194
+ with gr.Blocks(title="Transcript Timestamp Synchronizer") as demo:
195
  gr.Markdown("""
196
+ # 🎙️ Transcript Timestamp Synchronizer
197
 
198
+ This tool updates timestamps in human-edited transcripts based on new auto-generated transcripts.
199
 
200
  ## Instructions:
201
+ 1. Upload or paste your new auto-generated transcript (with updated timestamps)
202
+ 2. Upload or paste your human-edited transcript (with old timestamps)
203
+ 3. Click "Synchronize Timestamps" to generate an updated transcript
204
 
205
+ The tool will match segments between the transcripts and update the timestamps while preserving all human edits.
206
  """)
207
 
208
  with gr.Row():
209
  with gr.Column():
210
+ auto_source = gr.Radio(
211
+ ["Upload File", "Paste Text"],
212
+ label="Auto-generated Transcript Source",
213
+ value="Paste Text"
214
+ )
215
+ auto_file = gr.File(
216
+ label="Upload Auto-generated Transcript",
217
+ file_types=[".md", ".txt"],
218
+ visible=False
219
+ )
220
+ auto_text = gr.TextArea(
221
+ label="Auto-generated Transcript (with new timestamps)",
222
  placeholder="Paste the auto-generated transcript here...",
223
+ lines=15,
224
+ visible=True
225
  )
226
 
227
  with gr.Column():
228
+ human_source = gr.Radio(
229
+ ["Upload File", "Paste Text"],
230
+ label="Human-edited Transcript Source",
231
+ value="Paste Text"
232
+ )
233
+ human_file = gr.File(
234
+ label="Upload Human-edited Transcript",
235
+ file_types=[".md", ".txt"],
236
+ visible=False
237
+ )
238
+ human_text = gr.TextArea(
239
+ label="Human-edited Transcript (with old timestamps)",
240
  placeholder="Paste the human-edited transcript here...",
241
+ lines=15,
242
+ visible=True
243
  )
244
 
245
+ update_btn = gr.Button("Synchronize Timestamps")
246
 
247
  with gr.Tabs():
248
  with gr.TabItem("Updated Transcript"):
 
251
  placeholder="The updated transcript will appear here...",
252
  lines=20
253
  )
254
+ download_btn = gr.Button("Download Updated Transcript")
255
+ download_path = gr.File(label="Download", visible=False)
256
 
257
+ with gr.TabItem("Matching Report"):
258
+ matching_report = gr.Markdown(
259
+ label="Matching Report",
260
+ value="The matching report will appear here..."
261
  )
262
 
263
+ # Handle visibility of upload/paste options
264
+ def update_auto_visibility(choice):
265
+ return gr.update(visible=choice=="Upload File"), gr.update(visible=choice=="Paste Text")
266
+
267
+ def update_human_visibility(choice):
268
+ return gr.update(visible=choice=="Upload File"), gr.update(visible=choice=="Paste Text")
269
+
270
+ auto_source.change(update_auto_visibility, auto_source, [auto_file, auto_text])
271
+ human_source.change(update_human_visibility, human_source, [human_file, human_text])
272
+
273
+ # Load file content if uploaded
274
+ def load_auto_file(file):
275
+ if file is None:
276
+ return ""
277
+ with open(file.name, "r", encoding="utf-8") as f:
278
+ return f.read()
279
+
280
+ def load_human_file(file):
281
+ if file is None:
282
+ return ""
283
+ with open(file.name, "r", encoding="utf-8") as f:
284
+ return f.read()
285
+
286
+ auto_file.change(load_auto_file, auto_file, auto_text)
287
+ human_file.change(load_human_file, human_file, human_text)
288
+
289
+ # Process transcripts
290
+ def handle_process(auto_content, human_content):
291
+ return process_transcripts(auto_content, human_content)
292
+
293
  update_btn.click(
294
+ fn=handle_process,
295
+ inputs=[auto_text, human_text],
296
+ outputs=[updated_transcript, matching_report]
297
+ )
298
+
299
+ # Handle download
300
+ def prepare_download(transcript):
301
+ if not transcript:
302
+ return None
303
+ return save_transcript(transcript)
304
+
305
+ download_btn.click(
306
+ fn=prepare_download,
307
+ inputs=[updated_transcript],
308
+ outputs=[download_path]
309
  )
310
 
311
+ # For local testing
312
  if __name__ == "__main__":
313
  demo.launch()