dwarkesh committed on
Commit
4322c44
·
verified ·
1 Parent(s): c0cc6fd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +265 -0
app.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ import difflib
4
+ from typing import List, Dict, Tuple, Optional
5
+ import numpy as np
6
+ from dataclasses import dataclass
7
+
8
@dataclass
class Segment:
    """A single speaker turn taken from a transcript.

    The turn body is stored twice: once as written (``text``) and once with
    markdown markers removed (``raw_text``) so that similarity matching is not
    thrown off by formatting.
    """

    # Speaker label captured from the turn header.
    speaker: str
    # Timestamp string captured from the turn header (the parsers in this
    # file capture it as HH:MM:SS).
    timestamp: str
    # Turn body exactly as it appeared, formatting included.
    text: str
    # Turn body without markdown formatting; used for matching only.
    raw_text: str
15
+
16
@dataclass
class Match:
    """A pairing of one auto-generated segment with one human-edited segment."""

    # Position of the segment in the auto-generated segment list.
    auto_index: int
    # Position of the segment in the human-edited segment list.
    human_index: int
    # Similarity score that justified the pairing.
    similarity: float
22
+
23
def parse_auto_transcript(transcript: str) -> List[Segment]:
    """Split the auto-generated transcript into speaker turns.

    A turn is a ``Speaker X 00:00:00`` header (optionally wrapped in markdown
    bold/italic markers), a blank line, and then the body, which runs until
    the next blank line followed by a ``Speaker `` header or the end of input.

    Args:
        transcript: Full text of the auto-generated transcript.

    Returns:
        One ``Segment`` per turn found, in document order. ``text`` is the
        stripped body; ``raw_text`` is the same body with ``*`` / ``**``
        markers removed.
    """
    turn_re = re.compile(
        r"(?:\*\*)?Speaker (\w+)(?:\*\*)? (?:\*)?(\d{2}:\d{2}:\d{2})(?:\*)?\s*\n\n(.*?)(?=\n\n(?:\*\*)?Speaker |\Z)",
        re.DOTALL,
    )

    def to_segment(found) -> Segment:
        who, when, body = found.groups()
        body = body.strip()
        # Drop markdown emphasis markers so matching compares words only.
        unformatted = re.sub(r'\*\*|\*', '', body)
        return Segment(who, when, body, unformatted)

    return [to_segment(found) for found in turn_re.finditer(transcript)]
36
+
37
def parse_human_transcript(transcript: str) -> List[Segment]:
    """Parse the human-edited transcript into speaker turns.

    A turn is a header holding a speaker name and an ``HH:MM:SS`` timestamp,
    a blank line, and then the body. Both ``**Name** *00:00:00*`` and
    ``Speaker X 00:00:00`` headers are accepted; a leading ``Speaker `` is not
    kept in the captured name.

    The body runs until the next blank line that is followed by another
    header (or until the end of input), so a turn made of several paragraphs
    is kept whole. Speaker names may contain several words
    (``**Jane Doe** *00:01:02*``).

    Args:
        transcript: Full text of the human-edited transcript.

    Returns:
        One ``Segment`` per turn found, in document order. ``text`` is the
        stripped body; ``raw_text`` is the body with emphasis markers removed
        and markdown links reduced to their anchor text.
    """
    # A name starts with a word character and may continue with word
    # characters, spaces, dots, apostrophes or hyphens ("Dr. O'Neil-Smith").
    name = r"\w[\w .'\-]*?"
    stamp = r"\d{2}:\d{2}:\d{2}"

    turn_re = re.compile(
        r"(?:\*\*)?(?:Speaker )?(" + name + r")(?:\*\*)? \*?(" + stamp + r")\*?\s*\n\n"
        r"(.*?)"
        # The body must stop only in front of a *complete* header. A lookahead
        # whose header part is entirely optional is satisfied by any blank
        # line and would cut every turn down to its first paragraph.
        r"(?=\n\n[^\w\n]*(?:Speaker )?" + name + r"(?:\*\*)? \*?" + stamp + r"\*?\s*\n\n|\Z)",
        re.DOTALL,
    )

    segments = []
    for found in turn_re.finditer(transcript):
        speaker, timestamp, text = found.groups()
        body = text.strip()
        # For matching, keep the words of a link but not its URL: the linked
        # words were spoken, so they also appear in the auto transcript.
        plain = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', body)
        plain = re.sub(r'\*\*|\*', '', plain)
        segments.append(Segment(speaker.strip(), timestamp, body, plain))

    return segments
51
+
52
def similarity_score(text1: str, text2: str) -> float:
    """Return a 0.0-1.0 similarity ratio between two pieces of text.

    Both inputs are lower-cased and stripped of every character that is
    neither a word character nor whitespace, then compared character by
    character with ``difflib.SequenceMatcher``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        ``SequenceMatcher.ratio()`` of the cleaned texts (1.0 for two texts
        that are identical after cleaning, including two empty texts).
    """
    clean1 = re.sub(r'[^\w\s]', '', text1.lower())
    clean2 = re.sub(r'[^\w\s]', '', text2.lower())

    # autojunk must be off for character-level comparison of prose: with the
    # default heuristic, once the second text reaches 200 characters every
    # frequently occurring character is ignored as an anchor, so long,
    # near-identical segments can score close to 0.0. Disabling it trades
    # some speed on long segments for a correct ratio.
    matcher = difflib.SequenceMatcher(None, clean1, clean2, autojunk=False)
    return matcher.ratio()
60
+
61
def find_best_matches(auto_segments: List[Segment], human_segments: List[Segment]) -> List[Match]:
    """Greedily pair auto-generated segments with human-edited segments.

    Two passes are made over the auto segments, in order:

    1. each auto segment claims the still-unclaimed human segment with the
       highest similarity, provided that similarity is at least 0.6;
    2. auto segments left unpaired try again with the threshold lowered
       to 0.4.

    A human segment is claimed at most once. On equal scores the human
    segment that comes first wins.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        human_segments: Segments parsed from the human-edited transcript.

    Returns:
        The matches found, first-pass matches (in auto order) followed by
        second-pass matches (in auto order).
    """
    matches: List[Match] = []
    matched_auto = set()
    used_human = set()
    # Every pair needed by the second pass was already scored in the first
    # one; caching avoids running the expensive comparison twice.
    scores: Dict[Tuple[int, int], float] = {}

    def _score(auto_idx: int, human_idx: int) -> float:
        key = (auto_idx, human_idx)
        if key not in scores:
            scores[key] = similarity_score(
                auto_segments[auto_idx].raw_text,
                human_segments[human_idx].raw_text,
            )
        return scores[key]

    def _run_pass(threshold: float) -> None:
        """Pair every still-unmatched auto segment using ``threshold``."""
        for auto_idx, _ in enumerate(auto_segments):
            if auto_idx in matched_auto:
                continue

            best_idx = -1
            best_similarity = 0.0
            for human_idx, _ in enumerate(human_segments):
                if human_idx in used_human:
                    continue
                similarity = _score(auto_idx, human_idx)
                # Strict ">" keeps the earliest human segment on ties.
                if similarity > best_similarity and similarity >= threshold:
                    best_similarity = similarity
                    best_idx = human_idx

            if best_idx >= 0:
                matches.append(Match(auto_idx, best_idx, best_similarity))
                matched_auto.add(auto_idx)
                used_human.add(best_idx)

    _run_pass(0.6)  # obvious matches first
    _run_pass(0.4)  # then a more lenient retry for the leftovers

    return matches
108
+
109
def update_timestamps(
    auto_segments: List[Segment],
    human_segments: List[Segment],
    matches: List[Match],
    is_markdown: Optional[bool] = None,
) -> str:
    """Rebuild the human transcript with timestamps taken from the auto one.

    For every match, the human segment keeps its speaker and text and takes
    the timestamp of the matched auto segment. Human segments without a match
    keep their own timestamp. ``human_segments`` itself is not modified.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        human_segments: Segments parsed from the human-edited transcript.
        matches: Pairings between the two lists.
        is_markdown: Header style of the output. ``True`` emits
            ``**speaker** *timestamp*`` headers, ``False`` emits
            ``Speaker speaker timestamp`` headers. When ``None`` (default)
            the style is guessed from the first human segment: markdown if
            its text contains ``**`` or its timestamp contains ``*``.

    Returns:
        The segments rendered as header, blank line, body, joined by blank
        lines. An empty string when there are no human segments.
    """
    updated_segments = list(human_segments)

    for match in matches:
        auto_segment = auto_segments[match.auto_index]
        human_segment = human_segments[match.human_index]
        updated_segments[match.human_index] = Segment(
            speaker=human_segment.speaker,
            timestamp=auto_segment.timestamp,
            text=human_segment.text,
            raw_text=human_segment.raw_text,
        )

    if not updated_segments:
        return ""

    # The style decision does not depend on the segment being rendered, so
    # make it once instead of once per segment.
    if is_markdown is None:
        first = human_segments[0]
        # NOTE(review): this guess only looks at the first segment, so a
        # markdown transcript whose first turn has no bold text is rendered
        # with plain headers. Pass is_markdown explicitly to avoid that.
        is_markdown = "**" in first.text or "*" in first.timestamp

    # NOTE(review): the plain style always prefixes "Speaker " and the
    # markdown style never does, regardless of how the input header looked.
    result = []
    for segment in updated_segments:
        if is_markdown:
            result.append(f"**{segment.speaker}** *{segment.timestamp}*\n\n{segment.text}")
        else:
            result.append(f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}")

    return "\n\n".join(result)
136
+
137
def find_unmatched_segments(auto_segments: List[Segment], matches: List[Match]) -> List[int]:
    """List the positions of auto segments that no match refers to.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        matches: Pairings produced by the matcher.

    Returns:
        Indices into ``auto_segments``, ascending, whose index does not
        appear as ``auto_index`` in any match.
    """
    claimed = set()
    for pairing in matches:
        claimed.add(pairing.auto_index)

    leftovers = []
    for position, _ in enumerate(auto_segments):
        if position not in claimed:
            leftovers.append(position)
    return leftovers
141
+
142
def format_unmatched_segments(auto_segments: List[Segment], unmatched_indices: List[int], is_markdown: bool) -> str:
    """Render the unmatched auto segments as a markdown report.

    Args:
        auto_segments: Segments parsed from the auto-generated transcript.
        unmatched_indices: Indices into ``auto_segments`` to include.
        is_markdown: Whether segment headers use bold/italic markers.

    Returns:
        A fixed notice when ``unmatched_indices`` is empty; otherwise a
        heading followed by the selected segments separated by blank lines.
    """
    if not unmatched_indices:
        return "No unmatched segments found"

    def render(segment: Segment) -> str:
        if is_markdown:
            return f"**Speaker {segment.speaker}** *{segment.timestamp}*\n\n{segment.text}"
        return f"Speaker {segment.speaker} {segment.timestamp}\n\n{segment.text}"

    body = "\n\n".join(render(auto_segments[idx]) for idx in unmatched_indices)
    return "### Unmatched Segments (New Content)\n\n" + body
156
+
157
def process_transcripts(auto_transcript: str, human_transcript: str):
    """Run the full pipeline: parse, match, re-stamp, and report.

    Args:
        auto_transcript: Text of the new auto-generated transcript.
        human_transcript: Text of the human-edited transcript.

    Returns:
        A 3-tuple of strings: the re-stamped human transcript, a markdown
        report of auto segments that found no partner, and markdown matching
        statistics. If either transcript yields no segments, the first item
        is an error message and the other two are empty.
    """
    auto_segments = parse_auto_transcript(auto_transcript)
    human_segments = parse_human_transcript(human_transcript)

    # Nothing to do unless both sides produced at least one segment.
    if not (auto_segments and human_segments):
        return "Error: Could not parse one or both transcripts. Please check the format.", "", ""

    matches = find_best_matches(auto_segments, human_segments)
    unmatched_indices = find_unmatched_segments(auto_segments, matches)

    # Any asterisk in the human transcript is taken as a sign of markdown.
    is_markdown = "**" in human_transcript or "*" in human_transcript

    updated_transcript = update_timestamps(auto_segments, human_segments, matches)
    unmatched_segments = format_unmatched_segments(auto_segments, unmatched_indices, is_markdown)

    # Assemble the statistics report piece by piece.
    report = [
        "### Matching Statistics\n\n",
        f"- Auto-generated segments: {len(auto_segments)}\n",
        f"- Human-edited segments: {len(human_segments)}\n",
        f"- Matched segments: {len(matches)}\n",
        f"- Unmatched segments: {len(unmatched_indices)}\n",
    ]

    if matches:
        similarities = [match.similarity for match in matches]
        report.append(f"- Average match similarity: {sum(similarities)/len(similarities):.2f}\n")

        # Bucket the match scores into 0.1-wide bins between 0.4 and 1.0.
        bins = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        hist, _ = np.histogram(similarities, bins=bins)
        report.append("\n#### Match Quality Distribution\n\n")
        for lower, upper, count in zip(bins, bins[1:], hist):
            report.append(f"- {lower:.1f}-{upper:.1f}: {count} matches\n")

    stats = "".join(report)
    return updated_transcript, unmatched_segments, stats
204
+
205
+ # Create Gradio interface
206
with gr.Blocks(title="Transcript Timestamp Updater") as demo:
    # Introductory text and usage instructions shown at the top of the page.
    gr.Markdown("""
    # Transcript Timestamp Updater

    This tool updates timestamps in a human-edited transcript based on a new auto-generated transcript.

    ## Instructions:
    1. Paste your new auto-generated transcript (with updated timestamps)
    2. Paste your human-edited transcript (with old timestamps)
    3. Click "Update Timestamps" to generate a new version of the human-edited transcript with updated timestamps

    The tool will try to match segments between the two transcripts and update the timestamps accordingly.
    """)

    # The two input transcripts, side by side.
    with gr.Row():
        with gr.Column():
            auto_transcript = gr.Textbox(
                label="New Auto-Generated Transcript (with updated timestamps)",
                placeholder="Paste the new auto-generated transcript here...",
                lines=15,
            )

        with gr.Column():
            human_transcript = gr.Textbox(
                label="Human-Edited Transcript (with old timestamps)",
                placeholder="Paste your human-edited transcript here...",
                lines=15,
            )

    update_btn = gr.Button("Update Timestamps")

    # One tab per value returned by process_transcripts.
    with gr.Tabs():
        with gr.TabItem("Updated Transcript"):
            updated_transcript = gr.TextArea(
                label="Updated Human Transcript",
                placeholder="The updated transcript will appear here...",
                lines=20,
            )

        with gr.TabItem("Unmatched Segments"):
            unmatched_segments = gr.Markdown(
                label="Unmatched Segments",
                value="Unmatched segments will appear here...",
            )

        with gr.TabItem("Statistics"):
            stats = gr.Markdown(
                label="Matching Statistics",
                value="Statistics will appear here...",
            )

    # The outputs are filled, in order, from the tuple process_transcripts returns.
    update_btn.click(
        fn=process_transcripts,
        inputs=[auto_transcript, human_transcript],
        outputs=[updated_transcript, unmatched_segments, stats],
    )

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()