dwarkesh committed on
Commit
5725925
·
verified ·
1 Parent(s): fb1eceb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -254
app.py CHANGED
@@ -1,248 +1,130 @@
1
  import gradio as gr
2
  import re
3
- import difflib
4
- import os
5
- from typing import List, Dict, Tuple, Optional
6
- from dataclasses import dataclass
7
- import numpy as np
8
 
9
- @dataclass
10
- class Segment:
11
- """A segment of a transcript with a speaker and text"""
12
- speaker: str
13
- timestamp: str
14
- text: str
15
- original_text: str # The text as it appears in the original transcript
16
- index: int # Position in the original transcript
17
-
18
- def clean_text_for_matching(text: str) -> str:
19
- """Clean text for matching purposes (remove formatting, punctuation, etc.)"""
20
- # Remove markdown links and formatting
21
- text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text) # Replace markdown links with just the text
22
- text = re.sub(r'\*\*|\*', '', text) # Remove bold and italic formatting
23
-
24
- # Remove common filler words and punctuation for better matching
25
- text = re.sub(r'[,.;:!?]', ' ', text)
26
- text = re.sub(r'\s+', ' ', text)
27
-
28
- return text.lower().strip()
29
-
30
- def load_transcript_file(file_path: str) -> str:
31
- """Load transcript from a file"""
32
- with open(file_path, 'r', encoding='utf-8') as f:
33
- return f.read()
34
-
35
- def parse_transcript(transcript: str) -> List[Segment]:
36
  """
37
- Parse transcript into segments.
38
- Works with both formats:
39
- - Speaker LastName 00:00:00
40
- - **Speaker LastName** *00:00:00*
41
  """
42
- # Match both markdown and plain formats
43
- pattern = r"(?:\*\*)?(?:Speaker\s+)?([A-Za-z]+)(?:\*\*)?\s+(?:\*)?([0-9:]+)(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?(?:Speaker\s+)?[A-Za-z]+|\Z)"
44
-
45
  segments = []
46
- for i, match in enumerate(re.finditer(pattern, transcript, re.DOTALL)):
 
47
  speaker, timestamp, text = match.groups()
48
- original_text = text.strip()
49
- cleaned_text = clean_text_for_matching(original_text)
50
- segments.append(Segment(speaker, timestamp, cleaned_text, original_text, i))
51
 
52
  return segments
53
 
54
- def align_segments(auto_segments: List[Segment], human_segments: List[Segment]) -> Dict[int, int]:
55
- """
56
- Align segments from human-edited transcript to auto-generated transcript.
57
- Returns a dictionary mapping human segment indices to auto segment indices.
58
  """
59
- alignments = {}
60
-
61
- # Create text similarity matrix
62
- similarity_matrix = np.zeros((len(human_segments), len(auto_segments)))
63
-
64
- for h_idx, h_segment in enumerate(human_segments):
65
- for a_idx, a_segment in enumerate(auto_segments):
66
- similarity = difflib.SequenceMatcher(None, h_segment.text, a_segment.text).ratio()
67
- similarity_matrix[h_idx, a_idx] = similarity
68
 
69
- # Find best matches while maintaining order
70
- remaining_auto_indices = set(range(len(auto_segments)))
71
-
72
- for h_idx, h_segment in enumerate(human_segments):
73
- # Find the best matching auto segment that hasn't been assigned yet
74
- best_match = -1
75
- best_similarity = 0.5 # Threshold for considering a match
76
-
77
- for a_idx in remaining_auto_indices:
78
- similarity = similarity_matrix[h_idx, a_idx]
79
-
80
- if similarity > best_similarity:
81
- # Check if this would violate sequence ordering
82
- if all(aligned_a_idx < a_idx for aligned_h_idx, aligned_a_idx in alignments.items() if aligned_h_idx < h_idx):
83
- best_match = a_idx
84
- best_similarity = similarity
85
-
86
- if best_match >= 0:
87
- alignments[h_idx] = best_match
88
- remaining_auto_indices.remove(best_match)
89
-
90
- return alignments
91
 
92
- def update_transcript(human_segments: List[Segment], auto_segments: List[Segment],
93
- alignments: Dict[int, int], is_markdown: bool) -> str:
94
  """
95
- Create updated transcript by transferring timestamps from auto segments to human segments.
96
- Preserves all human edits, formatting, links, etc.
97
  """
98
- updated_segments = []
 
 
99
 
100
- for h_idx, h_segment in enumerate(human_segments):
101
- if h_idx in alignments:
102
- # Segment was matched, use timestamp from auto segment
103
- a_idx = alignments[h_idx]
104
-
105
- if is_markdown:
106
- updated_segments.append(f"**{h_segment.speaker}** *{auto_segments[a_idx].timestamp}*\n\n{h_segment.original_text}")
107
- else:
108
- updated_segments.append(f"Speaker {h_segment.speaker} {auto_segments[a_idx].timestamp}\n\n{h_segment.original_text}")
109
- else:
110
- # No match found, keep original timestamp but mark it
111
- if is_markdown:
112
- updated_segments.append(f"**{h_segment.speaker}** *{h_segment.timestamp} [NO MATCH]*\n\n{h_segment.original_text}")
113
- else:
114
- updated_segments.append(f"Speaker {h_segment.speaker} {h_segment.timestamp} [NO MATCH]\n\n{h_segment.original_text}")
115
-
116
- return "\n\n".join(updated_segments)
117
-
118
- def generate_match_report(human_segments: List[Segment], auto_segments: List[Segment],
119
- alignments: Dict[int, int]) -> str:
120
- """Generate a report about the matching process"""
121
- total_human = len(human_segments)
122
- total_auto = len(auto_segments)
123
- total_matched = len(alignments)
124
 
125
- report = f"### Matching Report\n\n"
126
- report += f"- Human segments: {total_human}\n"
127
- report += f"- Auto segments: {total_auto}\n"
128
- report += f"- Matched segments: {total_matched} ({total_matched/total_human*100:.1f}%)\n"
129
 
130
- if total_matched < total_human:
131
- report += f"\n### Unmatched Segments ({total_human - total_matched})\n\n"
132
- for h_idx, h_segment in enumerate(human_segments):
133
- if h_idx not in alignments:
134
- report += f"- Speaker {h_segment.speaker} at {h_segment.timestamp}: '{h_segment.text[:50]}...'\n"
135
 
136
- # Calculate average similarity of matches
137
- if alignments:
138
- similarities = [
139
- difflib.SequenceMatcher(None,
140
- human_segments[h_idx].text,
141
- auto_segments[a_idx].text).ratio()
142
- for h_idx, a_idx in alignments.items()
143
- ]
144
- avg_similarity = sum(similarities) / len(similarities)
145
- report += f"\n### Match Quality\n\n"
146
- report += f"- Average similarity: {avg_similarity:.2f}\n"
147
-
148
- return report
149
-
150
- def process_transcripts(auto_transcript, human_transcript):
151
- """Process the auto and human transcripts to update timestamps"""
152
- try:
153
- # Load transcripts
154
- auto_content = auto_transcript.decode('utf-8') if isinstance(auto_transcript, bytes) else auto_transcript
155
- human_content = human_transcript.decode('utf-8') if isinstance(human_transcript, bytes) else human_transcript
156
 
157
- # Check if transcripts use markdown formatting
158
- is_markdown = "**" in human_content
159
 
160
- # Parse transcripts
161
- auto_segments = parse_transcript(auto_content)
162
- human_segments = parse_transcript(human_content)
163
 
164
- if not auto_segments or not human_segments:
165
- return "Error: Could not parse transcripts. Please check the format.", ""
166
-
167
- # Align segments
168
- alignments = align_segments(auto_segments, human_segments)
169
-
170
- # Update transcript
171
- updated_transcript = update_transcript(human_segments, auto_segments, alignments, is_markdown)
172
-
173
- # Generate report
174
- report = generate_match_report(human_segments, auto_segments, alignments)
175
 
176
- return updated_transcript, report
 
177
 
178
- except Exception as e:
179
- return f"Error processing transcripts: {str(e)}", ""
180
-
181
- def save_transcript(transcript: str) -> str:
182
- """Save transcript to a temporary file and return the path"""
183
- output_dir = "output"
184
- if not os.path.exists(output_dir):
185
- os.makedirs(output_dir)
186
 
187
- output_path = os.path.join(output_dir, "updated_transcript.md")
188
- with open(output_path, 'w', encoding='utf-8') as f:
189
- f.write(transcript)
190
 
191
- return output_path
192
 
193
  # Create Gradio interface
194
- with gr.Blocks(title="Transcript Timestamp Synchronizer") as demo:
195
  gr.Markdown("""
196
- # 🎙️ Transcript Timestamp Synchronizer
197
 
198
- This tool updates timestamps in human-edited transcripts based on new auto-generated transcripts.
199
 
200
  ## Instructions:
201
- 1. Upload or paste your new auto-generated transcript (with updated timestamps)
202
- 2. Upload or paste your human-edited transcript (with old timestamps)
203
- 3. Click "Synchronize Timestamps" to generate an updated transcript
204
 
205
- The tool will match segments between the transcripts and update the timestamps while preserving all human edits.
206
  """)
207
 
208
  with gr.Row():
209
  with gr.Column():
210
- auto_source = gr.Radio(
211
- ["Upload File", "Paste Text"],
212
- label="Auto-generated Transcript Source",
213
- value="Paste Text"
214
- )
215
- auto_file = gr.File(
216
- label="Upload Auto-generated Transcript",
217
- file_types=[".md", ".txt"],
218
- visible=False
219
- )
220
- auto_text = gr.TextArea(
221
- label="Auto-generated Transcript (with new timestamps)",
222
  placeholder="Paste the auto-generated transcript here...",
223
- lines=15,
224
- visible=True
225
  )
226
 
227
  with gr.Column():
228
- human_source = gr.Radio(
229
- ["Upload File", "Paste Text"],
230
- label="Human-edited Transcript Source",
231
- value="Paste Text"
232
- )
233
- human_file = gr.File(
234
- label="Upload Human-edited Transcript",
235
- file_types=[".md", ".txt"],
236
- visible=False
237
- )
238
- human_text = gr.TextArea(
239
- label="Human-edited Transcript (with old timestamps)",
240
- placeholder="Paste the human-edited transcript here...",
241
- lines=15,
242
- visible=True
243
  )
244
 
245
- update_btn = gr.Button("Synchronize Timestamps")
246
 
247
  with gr.Tabs():
248
  with gr.TabItem("Updated Transcript"):
@@ -251,63 +133,19 @@ with gr.Blocks(title="Transcript Timestamp Synchronizer") as demo:
251
  placeholder="The updated transcript will appear here...",
252
  lines=20
253
  )
254
- download_btn = gr.Button("Download Updated Transcript")
255
- download_path = gr.File(label="Download", visible=False)
256
 
257
- with gr.TabItem("Matching Report"):
258
- matching_report = gr.Markdown(
259
- label="Matching Report",
260
- value="The matching report will appear here..."
261
  )
262
 
263
- # Handle visibility of upload/paste options
264
- def update_auto_visibility(choice):
265
- return gr.update(visible=choice=="Upload File"), gr.update(visible=choice=="Paste Text")
266
-
267
- def update_human_visibility(choice):
268
- return gr.update(visible=choice=="Upload File"), gr.update(visible=choice=="Paste Text")
269
-
270
- auto_source.change(update_auto_visibility, auto_source, [auto_file, auto_text])
271
- human_source.change(update_human_visibility, human_source, [human_file, human_text])
272
-
273
- # Load file content if uploaded
274
- def load_auto_file(file):
275
- if file is None:
276
- return ""
277
- with open(file.name, "r", encoding="utf-8") as f:
278
- return f.read()
279
-
280
- def load_human_file(file):
281
- if file is None:
282
- return ""
283
- with open(file.name, "r", encoding="utf-8") as f:
284
- return f.read()
285
-
286
- auto_file.change(load_auto_file, auto_file, auto_text)
287
- human_file.change(load_human_file, human_file, human_text)
288
-
289
- # Process transcripts
290
- def handle_process(auto_content, human_content):
291
- return process_transcripts(auto_content, human_content)
292
-
293
  update_btn.click(
294
- fn=handle_process,
295
- inputs=[auto_text, human_text],
296
- outputs=[updated_transcript, matching_report]
297
- )
298
-
299
- # Handle download
300
- def prepare_download(transcript):
301
- if not transcript:
302
- return None
303
- return save_transcript(transcript)
304
-
305
- download_btn.click(
306
- fn=prepare_download,
307
- inputs=[updated_transcript],
308
- outputs=[download_path]
309
  )
310
 
311
- # For local testing
312
  if __name__ == "__main__":
313
  demo.launch()
 
1
  import gradio as gr
2
  import re
3
+ from typing import List, Dict, Tuple
 
 
 
 
4
 
5
def extract_segments(transcript):
    """
    Extract speaker segments from a transcript.

    A segment starts with a header in either of these forms, followed by a
    blank line and then the segment text:

        Speaker 00:00:00
        **Speaker** *00:00:00*

    The text of a segment runs until the next complete header (or the end of
    the transcript), so it may span several paragraphs.

    Args:
        transcript: Full transcript text. ``None`` or an empty string yields
            an empty list.

    Returns:
        A list of tuples: (speaker, timestamp, text), in document order, with
        surrounding whitespace stripped from ``text``.
    """
    if not transcript:
        return []

    # Pasted text may carry Windows line endings; the header pattern relies on
    # a literal "\n\n" blank line, so normalise before matching.
    transcript = transcript.replace("\r\n", "\n")

    header = r"(?:\*\*)?{name}(?:\*\*)?\s+\*?{stamp}\*?\s*\n\n"
    capturing_header = header.format(name=r"([A-Za-z]+)", stamp=r"([0-9:]+)")
    next_header = header.format(name=r"[A-Za-z]+", stamp=r"[0-9:]+")

    # The text ends only at a blank line followed by a *complete* header.
    # Stopping at any paragraph that merely starts with a letter would cut
    # multi-paragraph segments off after their first paragraph.
    pattern = capturing_header + r"(.*?)(?=\n\n" + next_header + r"|\Z)"

    segments = []
    for match in re.finditer(pattern, transcript, re.DOTALL):
        speaker, timestamp, text = match.groups()
        segments.append((speaker, timestamp, text.strip()))

    return segments
18
 
19
def find_matching_segments(auto_segments, human_segments):
    """
    Pair human-edited segments with auto-generated segments by speaker order.

    The n-th segment spoken by a given speaker in the human transcript is
    paired with the n-th segment of that same speaker in the auto transcript.
    Segment text is not compared, so this assumes both transcripts contain
    the same speaker turns in the same order and differ only in timestamps.

    Args:
        auto_segments: Sequence of (speaker, timestamp, text) tuples from the
            auto-generated transcript.
        human_segments: Sequence of (speaker, timestamp, text) tuples from
            the human-edited transcript.

    Returns:
        A dictionary mapping human segment index to auto segment index.
        Human segments whose speaker has no remaining auto segment are
        absent from the mapping.
    """
    # Local import keeps this function self-contained; the module itself only
    # imports gradio, re and typing.
    from collections import defaultdict, deque

    # Queue of auto segment indices per speaker, in document order. A deque
    # gives O(1) removal from the front (list.pop(0) is O(n)).
    auto_by_speaker = defaultdict(deque)
    for a_idx, segment in enumerate(auto_segments):
        auto_by_speaker[segment[0]].append(a_idx)

    matches = {}
    for h_idx, segment in enumerate(human_segments):
        # .get() avoids creating empty queues for unknown speakers.
        pending = auto_by_speaker.get(segment[0])
        if pending:
            matches[h_idx] = pending.popleft()

    return matches
 
 
 
43
 
44
def update_timestamps(human_transcript, auto_transcript):
    """
    Update timestamps in human transcript using timestamps from auto transcript.

    Only the timestamp characters of each matched segment header are
    replaced; speaker names, markdown markers, spacing and all segment text
    are left exactly as the human editor wrote them.

    Args:
        human_transcript: Human-edited transcript text (old timestamps).
        auto_transcript: Auto-generated transcript text (new timestamps).

    Returns:
        A tuple ``(updated_transcript, report)``. If either transcript yields
        no segments, returns an error message and an empty report instead.
    """
    human_transcript = human_transcript or ""
    auto_transcript = auto_transcript or ""

    # Extract segments from both transcripts
    human_segments = extract_segments(human_transcript)
    auto_segments = extract_segments(auto_transcript)

    if not human_segments or not auto_segments:
        return "Error: Could not parse transcripts. Check formatting.", ""

    # Find matching segments
    matches = find_matching_segments(auto_segments, human_segments)

    # Walk the headers in document order, always searching forward from the
    # previous header. Replacing "the first occurrence in the whole text"
    # would hit the wrong header whenever two segments share the same
    # speaker/timestamp or one timestamp is a prefix of another.
    pieces = []
    copied_upto = 0  # end of the part of human_transcript already emitted
    search_from = 0  # headers are located strictly after the previous one
    updated = 0

    for h_idx, (h_speaker, h_timestamp, _) in enumerate(human_segments):
        # Accept both "**Speaker** *00:00:00*" and "Speaker 00:00:00"; the
        # lookahead stops "00:01" from matching the front of "00:01:30".
        header = re.compile(
            r"(?:\*\*)?" + re.escape(h_speaker) + r"(?:\*\*)?\s+\*?"
            + "(" + re.escape(h_timestamp) + ")" + r"(?![0-9:])"
        )
        found = header.search(human_transcript, search_from)
        if found is None:
            continue
        search_from = found.end()

        if h_idx not in matches:
            continue

        a_timestamp = auto_segments[matches[h_idx]][1]
        ts_start, ts_end = found.span(1)
        pieces.append(human_transcript[copied_upto:ts_start])
        pieces.append(a_timestamp)
        copied_upto = ts_end
        updated += 1

    pieces.append(human_transcript[copied_upto:])
    updated_transcript = "".join(pieces)

    # Generate report (counts reflect replacements actually performed)
    report_lines = [
        "### Timestamp Update Report\n",
        f"- Human segments: {len(human_segments)}",
        f"- Auto segments: {len(auto_segments)}",
        f"- Updated timestamps: {updated}",
    ]
    if updated < len(human_segments):
        report_lines.append(
            f"- Segments not updated: {len(human_segments) - updated}"
        )
    report = "\n".join(report_lines) + "\n"

    return updated_transcript, report
96
 
97
  # Create Gradio interface
98
+ with gr.Blocks(title="Simple Transcript Timestamp Updater") as demo:
99
  gr.Markdown("""
100
+ # 🎙️ Simple Transcript Timestamp Updater
101
 
102
+ This tool updates timestamps in a human-edited transcript based on an auto-generated transcript.
103
 
104
  ## Instructions:
105
+ 1. Paste your auto-generated transcript (with correct timestamps)
106
+ 2. Paste your human-edited transcript (with old timestamps)
107
+ 3. Click "Update Timestamps"
108
 
109
+ The tool will update only the timestamps while preserving all human edits.
110
  """)
111
 
112
  with gr.Row():
113
  with gr.Column():
114
+ auto_transcript = gr.Textbox(
115
+ label="Auto-Generated Transcript (with correct timestamps)",
 
 
 
 
 
 
 
 
 
 
116
  placeholder="Paste the auto-generated transcript here...",
117
+ lines=15
 
118
  )
119
 
120
  with gr.Column():
121
+ human_transcript = gr.Textbox(
122
+ label="Human-Edited Transcript (with old timestamps)",
123
+ placeholder="Paste your human-edited transcript here...",
124
+ lines=15
 
 
 
 
 
 
 
 
 
 
 
125
  )
126
 
127
+ update_btn = gr.Button("Update Timestamps")
128
 
129
  with gr.Tabs():
130
  with gr.TabItem("Updated Transcript"):
 
133
  placeholder="The updated transcript will appear here...",
134
  lines=20
135
  )
 
 
136
 
137
+ with gr.TabItem("Report"):
138
+ report = gr.Markdown(
139
+ label="Report",
140
+ value="Report will appear here..."
141
  )
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  update_btn.click(
144
+ fn=update_timestamps,
145
+ inputs=[human_transcript, auto_transcript],
146
+ outputs=[updated_transcript, report]
 
 
 
 
 
 
 
 
 
 
 
 
147
  )
148
 
149
+ # Launch the app
150
  if __name__ == "__main__":
151
  demo.launch()