dwarkesh commited on
Commit
8c39e27
·
verified ·
1 Parent(s): fa0d8b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -22
app.py CHANGED
@@ -15,16 +15,11 @@ from datetime import datetime
15
 
16
  prompt = '''
17
  You are an expert transcript editor. Your task is to enhance this transcript for maximum readability while maintaining the core message.
18
-
19
  IMPORTANT: Respond ONLY with the enhanced transcript. Do not include any explanations, headers, or phrases like "Here is the transcript."
20
-
21
  Note: Below you'll find an auto-generated transcript that may help with speaker identification, but focus on creating your own high-quality transcript from the audio.
22
-
23
  Think about your job as if you were transcribing an interview for a print book where the priority is the reading audience. It should just be a total pleasure to read this as a written artifact where all the flubs and repetitions and conversational artifacts and filler words and false starts are removed, where a bunch of helpful punctuation is added. It should basically read like somebody wrote it specifically for reading rather than just something somebody said extemporaneously.
24
-
25
  Please:
26
  1. Fix speaker attribution errors, especially at segment boundaries. Watch for incomplete thoughts that were likely from the previous speaker.
27
-
28
  2. Optimize AGGRESSIVELY for readability over verbatim accuracy:
29
  - Readability is the most important thing!!
30
  - Remove ALL conversational artifacts (yeah, so, I mean, etc.)
@@ -34,7 +29,6 @@ Please:
34
  - Convert any indirect or rambling responses into direct statements
35
  - Break up run-on sentences into clear, concise statements
36
  - Maintain natural conversation flow while prioritizing clarity and directness
37
-
38
  3. Format the output consistently:
39
  - Keep the "Speaker X 00:00:00" format (no brackets, no other formatting)
40
  - DO NOT change the timestamps. You're only seeing a chunk of the full transcript, which is why your 0:00:00 is not the true beginning. Keep the timestamps as they are.
@@ -44,21 +38,14 @@ Please:
44
  - When you add paragraph breaks between the same speaker's remarks, no need to restate the speaker attribution
45
  - Don't go more than four sentences without adding a paragraph break. Be liberal with your paragraph breaks.
46
  - Preserve distinct speaker turns
47
-
48
  Example input:
49
  Speaker A 00:01:15
50
-
51
  Um, yeah, so like, I've been working on this new project at work, you know? And uh, what's really interesting is that, uh, we're seeing these amazing results with the new approach we're taking. Like, it's just, you know, it's really transforming how we do things.
52
-
53
  And then, I mean, the thing is, uh, when we showed it to the client last week, they were just, you know, completely blown away by what we achieved. Like, they couldn't even believe it was the same system they had before.
54
-
55
  Example output:
56
  Speaker A 00:01:15
57
-
58
  I've been working on this new project at work, and we're seeing amazing results with our new approach. It's really transforming how we do things.
59
-
60
  When we showed it to the client last week, they were completely blown away by what we achieved. They couldn't believe it was the same system they had before.
61
-
62
  Enhance the following transcript, starting directly with the speaker format:
63
  '''
64
 
@@ -144,7 +131,7 @@ class Transcriber:
144
  class Enhancer:
145
  def __init__(self, api_key: str):
146
  generativeai.configure(api_key=api_key)
147
- self.model = generativeai.GenerativeModel("gemini-2.0-flash-lite-preview-02-05")
148
  self.prompt = prompt
149
 
150
  async def enhance_chunks(self, chunks: List[Tuple[str, io.BytesIO]]) -> List[str]:
@@ -274,7 +261,7 @@ def create_downloadable_file(content: str, prefix: str) -> str:
274
  return str(filepath)
275
 
276
 
277
- def process_audio(audio_file):
278
  try:
279
  temp_path = Path("temp_audio")
280
  temp_path.mkdir(exist_ok=True)
@@ -293,7 +280,7 @@ def process_audio(audio_file):
293
  )
294
 
295
  # Get transcript
296
- transcriber = Transcriber(os.getenv("ASSEMBLYAI_API_KEY"))
297
  utterances = transcriber.get_transcript(temp_file)
298
  dialogues = list(group_utterances_by_speaker(utterances))
299
  original = format_chunk(dialogues, markdown=True)
@@ -310,7 +297,7 @@ def process_audio(audio_file):
310
  )
311
 
312
  try:
313
- enhancer = Enhancer(os.getenv("GOOGLE_API_KEY"))
314
  chunks = prepare_audio_chunks(temp_file, utterances)
315
  enhanced = asyncio.run(enhancer.enhance_chunks(chunks))
316
  merged = "\n\n".join(chunk.strip() for chunk in enhanced)
@@ -349,15 +336,31 @@ def process_audio(audio_file):
349
  # Create the Gradio interface
350
  with gr.Blocks(title="Transcript Enhancer") as demo:
351
  gr.Markdown("""
352
- # πŸŽ™οΈ Audio Transcript Enhancer
353
 
354
- Upload an audio file to get both an automated transcript and an enhanced version using AI.
355
 
356
  1. The original transcript is generated using AssemblyAI with speaker detection
357
  2. The enhanced version uses Google's Gemini to improve clarity and readability
358
  """)
359
 
360
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  audio_input = gr.File(
362
  label="Upload Audio File",
363
  type="binary",
@@ -366,7 +369,7 @@ with gr.Blocks(title="Transcript Enhancer") as demo:
366
  )
367
 
368
  with gr.Row():
369
- transcribe_btn = gr.Button("📝 Transcribe & Enhance")
370
 
371
  with gr.Row():
372
  with gr.Column():
@@ -380,7 +383,7 @@ with gr.Blocks(title="Transcript Enhancer") as demo:
380
  original_output = gr.Markdown()
381
 
382
  with gr.Column():
383
- gr.Markdown("### Enhanced Transcript")
384
  enhanced_download = gr.File(
385
  label="Download as Markdown",
386
  file_count="single",
@@ -400,7 +403,7 @@ with gr.Blocks(title="Transcript Enhancer") as demo:
400
 
401
  transcribe_btn.click(
402
  fn=process_audio,
403
- inputs=[audio_input],
404
  outputs=[
405
  original_output,
406
  enhanced_output,
 
15
 
16
  prompt = '''
17
  You are an expert transcript editor. Your task is to enhance this transcript for maximum readability while maintaining the core message.
 
18
  IMPORTANT: Respond ONLY with the enhanced transcript. Do not include any explanations, headers, or phrases like "Here is the transcript."
 
19
  Note: Below you'll find an auto-generated transcript that may help with speaker identification, but focus on creating your own high-quality transcript from the audio.
 
20
  Think about your job as if you were transcribing an interview for a print book where the priority is the reading audience. It should just be a total pleasure to read this as a written artifact where all the flubs and repetitions and conversational artifacts and filler words and false starts are removed, where a bunch of helpful punctuation is added. It should basically read like somebody wrote it specifically for reading rather than just something somebody said extemporaneously.
 
21
  Please:
22
  1. Fix speaker attribution errors, especially at segment boundaries. Watch for incomplete thoughts that were likely from the previous speaker.
 
23
  2. Optimize AGGRESSIVELY for readability over verbatim accuracy:
24
  - Readability is the most important thing!!
25
  - Remove ALL conversational artifacts (yeah, so, I mean, etc.)
 
29
  - Convert any indirect or rambling responses into direct statements
30
  - Break up run-on sentences into clear, concise statements
31
  - Maintain natural conversation flow while prioritizing clarity and directness
 
32
  3. Format the output consistently:
33
  - Keep the "Speaker X 00:00:00" format (no brackets, no other formatting)
34
  - DO NOT change the timestamps. You're only seeing a chunk of the full transcript, which is why your 0:00:00 is not the true beginning. Keep the timestamps as they are.
 
38
  - When you add paragraph breaks between the same speaker's remarks, no need to restate the speaker attribution
39
  - Don't go more than four sentences without adding a paragraph break. Be liberal with your paragraph breaks.
40
  - Preserve distinct speaker turns
 
41
  Example input:
42
  Speaker A 00:01:15
 
43
  Um, yeah, so like, I've been working on this new project at work, you know? And uh, what's really interesting is that, uh, we're seeing these amazing results with the new approach we're taking. Like, it's just, you know, it's really transforming how we do things.
 
44
  And then, I mean, the thing is, uh, when we showed it to the client last week, they were just, you know, completely blown away by what we achieved. Like, they couldn't even believe it was the same system they had before.
 
45
  Example output:
46
  Speaker A 00:01:15
 
47
  I've been working on this new project at work, and we're seeing amazing results with our new approach. It's really transforming how we do things.
 
48
  When we showed it to the client last week, they were completely blown away by what we achieved. They couldn't believe it was the same system they had before.
 
49
  Enhance the following transcript, starting directly with the speaker format:
50
  '''
51
 
 
131
  class Enhancer:
132
  def __init__(self, api_key: str):
133
  generativeai.configure(api_key=api_key)
134
+ self.model = generativeai.GenerativeModel("gemini-2.5-pro-preview-03-25")
135
  self.prompt = prompt
136
 
137
  async def enhance_chunks(self, chunks: List[Tuple[str, io.BytesIO]]) -> List[str]:
 
261
  return str(filepath)
262
 
263
 
264
+ def process_audio(audio_file, google_api_key_input, assemblyai_api_key_input):
265
  try:
266
  temp_path = Path("temp_audio")
267
  temp_path.mkdir(exist_ok=True)
 
280
  )
281
 
282
  # Get transcript
283
+ transcriber = Transcriber(assemblyai_api_key_input)
284
  utterances = transcriber.get_transcript(temp_file)
285
  dialogues = list(group_utterances_by_speaker(utterances))
286
  original = format_chunk(dialogues, markdown=True)
 
297
  )
298
 
299
  try:
300
+ enhancer = Enhancer(google_api_key_input)
301
  chunks = prepare_audio_chunks(temp_file, utterances)
302
  enhanced = asyncio.run(enhancer.enhance_chunks(chunks))
303
  merged = "\n\n".join(chunk.strip() for chunk in enhanced)
 
336
  # Create the Gradio interface
337
  with gr.Blocks(title="Transcript Enhancer") as demo:
338
  gr.Markdown("""
339
+ # πŸŽ™οΈ Gemini Content Producer
340
 
341
+ Upload an audio file to get both an automated transcript and an enhanced version using Gemini.
342
 
343
  1. The original transcript is generated using AssemblyAI with speaker detection
344
  2. The enhanced version uses Google's Gemini to improve clarity and readability
345
  """)
346
 
347
  with gr.Row():
348
+ google_api_key_input = gr.Textbox(
349
+ label="Google API Key",
350
+ placeholder="Enter your Google API Key here",
351
+ type="password",
352
+ lines=1,
353
+ info="Your GCP account needs to have billing enabled to use the 2.5 pro model.",
354
+ scale=1
355
+ )
356
+ assemblyai_api_key_input = gr.Textbox(
357
+ label="AssemblyAI API Key",
358
+ placeholder="Enter your AssemblyAI API Key here",
359
+ type="password",
360
+ lines=1,
361
+ info="Your key is used for initial audio transcription.",
362
+ scale=1
363
+ )
364
  audio_input = gr.File(
365
  label="Upload Audio File",
366
  type="binary",
 
369
  )
370
 
371
  with gr.Row():
372
+ transcribe_btn = gr.Button("📝 Transcribe & Gemini")
373
 
374
  with gr.Row():
375
  with gr.Column():
 
383
  original_output = gr.Markdown()
384
 
385
  with gr.Column():
386
+ gr.Markdown("### Gemini Transcript")
387
  enhanced_download = gr.File(
388
  label="Download as Markdown",
389
  file_count="single",
 
403
 
404
  transcribe_btn.click(
405
  fn=process_audio,
406
+ inputs=[audio_input, google_api_key_input, assemblyai_api_key_input],
407
  outputs=[
408
  original_output,
409
  enhanced_output,