youngtsai committed on
Commit
3e8ecd3
·
1 Parent(s): 36e3da3

add WebshareProxyConfig

Browse files
Files changed (2) hide show
  1. app.py +50 -19
  2. requirements.txt +1 -1
app.py CHANGED
@@ -48,6 +48,8 @@ from pydub import AudioSegment
48
 
49
 
50
  from youtube_transcript_api import YouTubeTranscriptApi
 
 
51
  from youtube_transcript_api._errors import NoTranscriptFound
52
  import yt_dlp
53
 
@@ -106,6 +108,8 @@ if is_env_local:
106
  AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
107
  AWS_REGION_NAME = config["AWS_REGION_NAME"]
108
  OUTPUT_PATH = config["OUTPUT_PATH"]
 
 
109
 
110
  else:
111
  IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
@@ -124,6 +128,8 @@ else:
124
  AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
125
  AWS_REGION_NAME = 'us-west-2'
126
  OUTPUT_PATH = 'videos'
 
 
127
 
128
  TRANSCRIPTS = []
129
  CURRENT_INDEX = 0
@@ -391,40 +397,65 @@ def extract_youtube_id(url):
391
  else:
392
  return None
393
 
394
- def get_transcript_by_yt_api(video_id):
395
- print("====get_transcript_by_yt_api====")
396
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
397
- languages = []
398
- for t in transcript_list:
399
- languages.append(t.language_code)
400
-
 
 
 
 
 
 
 
401
  for language in languages:
 
 
 
402
  try:
403
- yt_api_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
 
 
 
404
  print("===transcript===")
405
  print(yt_api_transcript)
406
-
407
  original_transcript = ""
408
  for entry in yt_api_transcript:
409
  transcript_part = (f"{entry['start']:.0f}s: {entry['text']}")
410
  print(transcript_part)
411
  original_transcript += f"{transcript_part} \n"
412
  print("===transcript===")
413
-
414
  transcript = convert_transcription_to_json(original_transcript)
415
- return transcript # 成功獲取字幕,直接返回結果
416
  except NoTranscriptFound:
417
- continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
418
- return None # 所有嘗試都失敗,返回None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
 
420
  def generate_transcription_by_gemini(video_id):
421
  """使用 Google Gemini 生成影片逐字稿"""
422
  print("====generate_transcription_by_gemini====")
423
  # 準備 YouTube 影片 URL
424
  video_url = f"https://www.youtube.com/watch?v={video_id}"
425
-
426
- # 初始化 Gemini Pro Vision 模型
427
- model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash-preview-05-20")
428
 
429
  # 建立影片部分
430
  video_part = Part.from_uri(
@@ -481,7 +512,7 @@ def convert_transcription_to_json(original_transcription):
481
  return None
482
 
483
  # 使用 Vertex AI 來處理轉換
484
- model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash-preview-05-20")
485
 
486
  # 設定每段最大字數
487
  # 考慮到:
@@ -535,7 +566,7 @@ def convert_transcription_to_json(original_transcription):
535
  4. 回傳格式為 JSON array
536
  5. 合理的合併句子,不要有不合理的斷句,一句話至少要有完整的主詞、謂詞
537
  6. 每句話盡量在 10~15 個字左右,但要以完整語意為主
538
- 7. 如果遇到 [Music] 這類的標記,可以直接忽略不計
539
  8. 這是第 {i+1}/{len(chunks)} 段,請確保時間軸的連續性
540
 
541
  請直接返回 JSON 格式,不要加入任何說明文字或 markdown 標記。
@@ -659,7 +690,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
659
  # transcript = generate_transcription_by_gemini(video_id)
660
  except Exception as e:
661
  print(f" Error generating transcription: {str(e)}")
662
- transcript = generate_transcription_by_gemini(video_id)
663
  # transcript = generate_transcription_by_whisper(video_id)
664
 
665
  upload_transcript_to_gcs(video_id, transcript)
 
48
 
49
 
50
  from youtube_transcript_api import YouTubeTranscriptApi
51
+ from youtube_transcript_api.proxies import WebshareProxyConfig
52
+
53
  from youtube_transcript_api._errors import NoTranscriptFound
54
  import yt_dlp
55
 
 
108
  AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
109
  AWS_REGION_NAME = config["AWS_REGION_NAME"]
110
  OUTPUT_PATH = config["OUTPUT_PATH"]
111
+ PROXY_USERNAME = config["PROXY_USERNAME"]
112
+ PROXY_PASSWORD = config["PROXY_PASSWORD"]
113
 
114
  else:
115
  IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
 
128
  AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
129
  AWS_REGION_NAME = 'us-west-2'
130
  OUTPUT_PATH = 'videos'
131
+ PROXY_USERNAME = os.getenv("PROXY_USERNAME")
132
+ PROXY_PASSWORD = os.getenv("PROXY_PASSWORD")
133
 
134
  TRANSCRIPTS = []
135
  CURRENT_INDEX = 0
 
397
  else:
398
  return None
399
 
400
def try_get_transcript(video_id, use_proxy=False):
    """Fetch a YouTube transcript in the first supported language and convert it.

    The available transcripts are listed once, then tried in the order given by
    ``language_priority``. The first one that can be fetched is rendered as
    ``"<start>s: <text>"`` lines and handed to ``convert_transcription_to_json``.

    Args:
        video_id: YouTube video ID.
        use_proxy: When True, requests go through a Webshare proxy configured
            from the module-level ``PROXY_USERNAME`` / ``PROXY_PASSWORD``.

    Returns:
        The value returned by ``convert_transcription_to_json`` for the first
        transcript that could be fetched.

    Raises:
        NoTranscriptFound: If none of the prioritized languages yields a
            transcript. Other errors from youtube-transcript-api (blocked IP,
            disabled transcripts, ...) propagate unchanged.
    """
    proxy_config = None
    if use_proxy:
        proxy_config = WebshareProxyConfig(
            proxy_username=PROXY_USERNAME,
            proxy_password=PROXY_PASSWORD,
        )

    # youtube-transcript-api 1.x configures proxies on an API *instance*; the
    # static list_transcripts/get_transcript helpers are deprecated and are
    # not available in newer releases allowed by the ">= 1.1.0" requirement.
    ytt_api = YouTubeTranscriptApi(proxy_config=proxy_config)
    transcript_list = ytt_api.list(video_id)

    # Preferred languages, highest priority first.
    language_priority = ["en", "zh-TW", "zh-CN", "ja"]
    # Keep only the languages this video actually offers, in priority order.
    available_languages = {t.language_code for t in transcript_list}
    languages = [lang for lang in language_priority if lang in available_languages]

    for language in languages:
        print("===language===")
        print(f"use language: {language}")
        print("===language===")
        try:
            # Reuse the listing already fetched instead of re-requesting it
            # for every language; to_raw_data() yields the list-of-dicts shape
            # ({'text', 'start', 'duration'}) the formatting below relies on.
            yt_api_transcript = (
                transcript_list.find_transcript([language]).fetch().to_raw_data()
            )
        except NoTranscriptFound:
            # Nothing usable for this language; try the next one.
            continue

        print("===transcript===")
        print(yt_api_transcript)
        original_transcript = ""
        for entry in yt_api_transcript:
            transcript_part = f"{entry['start']:.0f}s: {entry['text']}"
            print(transcript_part)
            original_transcript += f"{transcript_part} \n"
        print("===transcript===")
        return convert_transcription_to_json(original_transcript)

    # NoTranscriptFound requires (video_id, requested_language_codes,
    # transcript_data); passing a single message string raises TypeError.
    raise NoTranscriptFound(video_id, language_priority, transcript_list)
436
+
437
def get_transcript_by_yt_api(video_id):
    """Get a transcript via the YouTube transcript API, retrying through a proxy.

    A direct (no proxy) attempt is made first. If it raises anything, the error
    is printed and a second attempt is made with ``use_proxy=True``. If that
    also fails, the error is printed and the proxy attempt's exception is
    raised to the caller.

    Args:
        video_id: YouTube video ID.

    Returns:
        Whatever ``try_get_transcript`` returns for the first attempt that
        succeeds.
    """
    print("====get_transcript_by_yt_api====")
    # First attempt: direct connection, no proxy.
    try:
        print("====try_get_transcript without proxy====")
        direct_result = try_get_transcript(video_id, use_proxy=False)
    except Exception as direct_error:
        print(f"No proxy transcript error: {direct_error}")
        # Second attempt: same request routed through the proxy.
        try:
            print("====try_get_transcript with proxy====")
            proxied_result = try_get_transcript(video_id, use_proxy=True)
        except Exception as proxy_error:
            print(f"With proxy transcript error: {proxy_error}")
            raise proxy_error
        else:
            return proxied_result
    else:
        return direct_result
452
 
453
  def generate_transcription_by_gemini(video_id):
454
  """使用 Google Gemini 生成影片逐字稿"""
455
  print("====generate_transcription_by_gemini====")
456
  # 準備 YouTube 影片 URL
457
  video_url = f"https://www.youtube.com/watch?v={video_id}"
458
+ model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash")
 
 
459
 
460
  # 建立影片部分
461
  video_part = Part.from_uri(
 
512
  return None
513
 
514
  # 使用 Vertex AI 來處理轉換
515
+ model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash")
516
 
517
  # 設定每段最大字數
518
  # 考慮到:
 
566
  4. 回傳格式為 JSON array
567
  5. 合理的合併句子,不要有不合理的斷句,一句話至少要有完整的主詞、謂詞
568
  6. 每句話盡量在 10~15 個字左右,但要以完整語意為主
569
+ 7. 如果遇到 [Music] [Laughter] [Crowd] [Cheering] [Applause]這類的標記,可以直接忽略不計
570
  8. 這是第 {i+1}/{len(chunks)} 段,請確保時間軸的連續性
571
 
572
  請直接返回 JSON 格式,不要加入任何說明文字或 markdown 標記。
 
690
  # transcript = generate_transcription_by_gemini(video_id)
691
  except Exception as e:
692
  print(f" Error generating transcription: {str(e)}")
693
+ # transcript = generate_transcription_by_gemini(video_id)
694
  # transcript = generate_transcription_by_whisper(video_id)
695
 
696
  upload_transcript_to_gcs(video_id, transcript)
requirements.txt CHANGED
@@ -3,7 +3,7 @@ pandas
3
  openai>=1.16.2
4
  requests
5
  python-docx
6
- youtube-transcript-api
7
  moviepy==1.0.3
8
  pytube
9
  google-auth
 
3
  openai>=1.16.2
4
  requests
5
  python-docx
6
+ youtube-transcript-api >= 1.1.0
7
  moviepy==1.0.3
8
  pytube
9
  google-auth