youngtsai committed on
Commit
1630bbe
·
1 Parent(s): 48d17f9

adjusted_segments

Browse files
Files changed (2) hide show
  1. app.py +44 -21
  2. requirements.txt +2 -1
app.py CHANGED
@@ -8,6 +8,9 @@ from openai import OpenAI
8
  from groq import Groq
9
  import uuid
10
  from gtts import gTTS
 
 
 
11
 
12
  from youtube_transcript_api import YouTubeTranscriptApi
13
  from youtube_transcript_api._errors import NoTranscriptFound
@@ -377,32 +380,52 @@ def generate_transcription(video_id):
377
  }],
378
  'outtmpl': outtmpl,
379
  }
380
- print("===download video mp3===")
381
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
382
  ydl.download([youtube_url])
383
 
384
  audio_path = f"{OUTPUT_PATH}/{video_id}.{codec_name}"
385
- print("===transcription by open ai===")
386
- with open(audio_path, "rb") as audio_file:
387
- srt_content = OPEN_AI_CLIENT.audio.transcriptions.create(
388
- model="whisper-1",
389
- file=audio_file,
390
- response_format="verbose_json",
391
- timestamp_granularities=["segment"],
392
- prompt="Transcribe the following audio file. if chinese, please using 'language: zh-TW' ",
393
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
- # get segments
396
- segments = srt_content.segments
397
- transcription = [
398
- {
399
- "text": item["text"],
400
- "start": int(item["start"]),
401
- "duration": int(item["end"] - item["start"])
402
- }
403
- for item in segments
404
- ]
405
- return transcription
406
 
407
  def process_transcript_and_screenshots(video_id):
408
  print("====process_transcript_and_screenshots====")
 
8
  from groq import Groq
9
  import uuid
10
  from gtts import gTTS
11
+ import math
12
+ from pydub import AudioSegment
13
+
14
 
15
  from youtube_transcript_api import YouTubeTranscriptApi
16
  from youtube_transcript_api._errors import NoTranscriptFound
 
380
  }],
381
  'outtmpl': outtmpl,
382
  }
383
+
384
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
385
  ydl.download([youtube_url])
386
 
387
  audio_path = f"{OUTPUT_PATH}/{video_id}.{codec_name}"
388
+ full_audio = AudioSegment.from_mp3(audio_path)
389
+
390
+ max_part_duration = 10 * 60 * 1000 # 10 minutes
391
+ full_duration = len(full_audio) # in milliseconds
392
+ parts = math.ceil(full_duration / max_part_duration)
393
+ print(f"parts: {parts}")
394
+ transcription = []
395
+
396
+ for i in range(parts):
397
+ print(f"== i: {i}==")
398
+ start_time = i * max_part_duration
399
+ end_time = min((i + 1) * max_part_duration, full_duration)
400
+ print(f"time: {start_time/1000} - {end_time/1000}")
401
+ chunk = full_audio[start_time:end_time]
402
+ chunk_path = f"{OUTPUT_PATH}/{video_id}_part_{i}.{codec_name}"
403
+ chunk.export(chunk_path, format=codec_name)
404
+
405
+ with open(chunk_path, "rb") as chunk_file:
406
+ response = OPEN_AI_CLIENT.audio.transcriptions.create(
407
+ model="whisper-1",
408
+ file=chunk_file,
409
+ response_format="verbose_json",
410
+ timestamp_granularities=["segment"],
411
+ prompt="Transcribe the following audio file. if chinese, please using 'language: zh-TW' ",
412
+ )
413
+
414
+ # Adjusting the timestamps for the chunk based on its position in the full audio
415
+ adjusted_segments = [{
416
+ 'text': segment['text'],
417
+ 'start': math.ceil(segment['start'] + start_time / 1000.0), # Converting milliseconds to seconds
418
+ 'end': math.ceil(segment['end'] + start_time / 1000.0),
419
+ 'duration': math.ceil(segment['end'] - segment['start'])
420
+ } for segment in response.segments]
421
 
422
+ transcription.extend(adjusted_segments)
423
+
424
+ # Remove temporary chunk files after processing
425
+ os.remove(chunk_path)
426
+
427
+ return transcription
428
+
 
 
 
 
429
 
430
  def process_transcript_and_screenshots(video_id):
431
  print("====process_transcript_and_screenshots====")
requirements.txt CHANGED
@@ -16,4 +16,5 @@ groq
16
  yt_dlp
17
  uuid
18
  gtts
19
- boto3
 
 
16
  yt_dlp
17
  uuid
18
  gtts
19
+ boto3
20
+ pydub