Spaces:
Sleeping
Sleeping
add WebshareProxyConfig
Browse files- app.py +50 -19
- requirements.txt +1 -1
app.py
CHANGED
@@ -48,6 +48,8 @@ from pydub import AudioSegment
|
|
48 |
|
49 |
|
50 |
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
|
|
51 |
from youtube_transcript_api._errors import NoTranscriptFound
|
52 |
import yt_dlp
|
53 |
|
@@ -106,6 +108,8 @@ if is_env_local:
|
|
106 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
107 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
108 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
|
|
|
|
109 |
|
110 |
else:
|
111 |
IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
|
@@ -124,6 +128,8 @@ else:
|
|
124 |
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
|
125 |
AWS_REGION_NAME = 'us-west-2'
|
126 |
OUTPUT_PATH = 'videos'
|
|
|
|
|
127 |
|
128 |
TRANSCRIPTS = []
|
129 |
CURRENT_INDEX = 0
|
@@ -391,40 +397,65 @@ def extract_youtube_id(url):
|
|
391 |
else:
|
392 |
return None
|
393 |
|
394 |
-
def
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
for language in languages:
|
|
|
|
|
|
|
402 |
try:
|
403 |
-
|
|
|
|
|
|
|
404 |
print("===transcript===")
|
405 |
print(yt_api_transcript)
|
406 |
-
|
407 |
original_transcript = ""
|
408 |
for entry in yt_api_transcript:
|
409 |
transcript_part = (f"{entry['start']:.0f}s: {entry['text']}")
|
410 |
print(transcript_part)
|
411 |
original_transcript += f"{transcript_part} \n"
|
412 |
print("===transcript===")
|
413 |
-
|
414 |
transcript = convert_transcription_to_json(original_transcript)
|
415 |
-
return transcript
|
416 |
except NoTranscriptFound:
|
417 |
-
continue
|
418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
419 |
|
420 |
def generate_transcription_by_gemini(video_id):
|
421 |
"""使用 Google Gemini 生成影片逐字稿"""
|
422 |
print("====generate_transcription_by_gemini====")
|
423 |
# 準備 YouTube 影片 URL
|
424 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
425 |
-
|
426 |
-
# 初始化 Gemini Pro Vision 模型
|
427 |
-
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash-preview-05-20")
|
428 |
|
429 |
# 建立影片部分
|
430 |
video_part = Part.from_uri(
|
@@ -481,7 +512,7 @@ def convert_transcription_to_json(original_transcription):
|
|
481 |
return None
|
482 |
|
483 |
# 使用 Vertex AI 來處理轉換
|
484 |
-
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash
|
485 |
|
486 |
# 設定每段最大字數
|
487 |
# 考慮到:
|
@@ -535,7 +566,7 @@ def convert_transcription_to_json(original_transcription):
|
|
535 |
4. 回傳格式為 JSON array
|
536 |
5. 合理的合併句子,不要有不合理的斷句,一句話至少要有完整的主詞、謂詞
|
537 |
6. 每句話盡量在 10~15 個字左右,但要以完整語意為主
|
538 |
-
7. 如果遇到 [Music] 這類的標記,可以直接忽略不計
|
539 |
8. 這是第 {i+1}/{len(chunks)} 段,請確保時間軸的連續性
|
540 |
|
541 |
請直接返回 JSON 格式,不要加入任何說明文字或 markdown 標記。
|
@@ -659,7 +690,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
659 |
# transcript = generate_transcription_by_gemini(video_id)
|
660 |
except Exception as e:
|
661 |
print(f" Error generating transcription: {str(e)}")
|
662 |
-
transcript = generate_transcription_by_gemini(video_id)
|
663 |
# transcript = generate_transcription_by_whisper(video_id)
|
664 |
|
665 |
upload_transcript_to_gcs(video_id, transcript)
|
|
|
48 |
|
49 |
|
50 |
from youtube_transcript_api import YouTubeTranscriptApi
|
51 |
+
from youtube_transcript_api.proxies import WebshareProxyConfig
|
52 |
+
|
53 |
from youtube_transcript_api._errors import NoTranscriptFound
|
54 |
import yt_dlp
|
55 |
|
|
|
108 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
109 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
110 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
111 |
+
PROXY_USERNAME = config["PROXY_USERNAME"]
|
112 |
+
PROXY_PASSWORD = config["PROXY_PASSWORD"]
|
113 |
|
114 |
else:
|
115 |
IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
|
|
|
128 |
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
|
129 |
AWS_REGION_NAME = 'us-west-2'
|
130 |
OUTPUT_PATH = 'videos'
|
131 |
+
PROXY_USERNAME = os.getenv("PROXY_USERNAME")
|
132 |
+
PROXY_PASSWORD = os.getenv("PROXY_PASSWORD")
|
133 |
|
134 |
TRANSCRIPTS = []
|
135 |
CURRENT_INDEX = 0
|
|
|
397 |
else:
|
398 |
return None
|
399 |
|
400 |
+
def try_get_transcript(video_id, use_proxy=False):
    """Fetch a YouTube transcript in the first available preferred language.

    Args:
        video_id: YouTube video ID to fetch the transcript for.
        use_proxy: When true, requests are routed through a Webshare proxy
            built from the module-level PROXY_USERNAME / PROXY_PASSWORD.

    Returns:
        The result of convert_transcription_to_json() applied to the
        transcript rendered as one "<start>s: <text>" line per entry.

    Raises:
        NoTranscriptFound: If none of the preferred languages yields a
            transcript. Other youtube_transcript_api errors propagate.
    """
    # A ProxyConfig must be handed to the YouTubeTranscriptApi constructor.
    # The static list_transcripts()/get_transcript() helpers only understood
    # a requests-style ``proxies`` dict and no longer exist in newer releases.
    proxy_config = None
    if use_proxy:
        proxy_config = WebshareProxyConfig(
            proxy_username=PROXY_USERNAME,
            proxy_password=PROXY_PASSWORD,
        )
    ytt_api = YouTubeTranscriptApi(proxy_config=proxy_config)
    transcript_list = ytt_api.list(video_id)

    # Language preference, highest priority first.
    language_priority = ["en", "zh-TW", "zh-CN", "ja"]
    # Keep only the languages this video offers, in priority order.
    available_languages = {t.language_code for t in transcript_list}
    languages = [lang for lang in language_priority if lang in available_languages]

    for language in languages:
        print("===language===")
        print(f"use language: {language}")
        print("===language===")
        try:
            # Reuse the listing fetched above (same HTTP client and proxy)
            # instead of listing the video again for every language.
            yt_api_transcript = (
                transcript_list.find_transcript([language]).fetch().to_raw_data()
            )
        except NoTranscriptFound:
            continue

        print("===transcript===")
        print(yt_api_transcript)
        lines = []
        for entry in yt_api_transcript:
            transcript_part = f"{entry['start']:.0f}s: {entry['text']}"
            print(transcript_part)
            lines.append(f"{transcript_part} \n")
        original_transcript = "".join(lines)
        print("===transcript===")
        return convert_transcription_to_json(original_transcript)

    # NoTranscriptFound requires (video_id, requested_language_codes,
    # transcript_data); a single message string raises TypeError instead.
    raise NoTranscriptFound(video_id, language_priority, transcript_list)
|
436 |
+
|
437 |
+
def get_transcript_by_yt_api(video_id):
    """Get a transcript via the YouTube transcript API, retrying through the proxy.

    A direct (no-proxy) attempt is made first. If it raises any exception,
    the error is printed and a second attempt is made with the proxy enabled.
    If that also fails, its error is printed and re-raised to the caller.

    Args:
        video_id: YouTube video ID passed through to try_get_transcript().

    Returns:
        Whatever try_get_transcript() returns for the first successful attempt.
    """
    print("====get_transcript_by_yt_api====")
    try:
        # First attempt: direct connection without a proxy.
        print("====try_get_transcript without proxy====")
        direct_result = try_get_transcript(video_id, use_proxy=False)
    except Exception as direct_error:
        print(f"No proxy transcript error: {direct_error}")
        try:
            # Second attempt: same request routed through the proxy.
            print("====try_get_transcript with proxy====")
            proxied_result = try_get_transcript(video_id, use_proxy=True)
        except Exception as proxy_error:
            print(f"With proxy transcript error: {proxy_error}")
            raise
        else:
            return proxied_result
    else:
        return direct_result
|
452 |
|
453 |
def generate_transcription_by_gemini(video_id):
|
454 |
"""使用 Google Gemini 生成影片逐字稿"""
|
455 |
print("====generate_transcription_by_gemini====")
|
456 |
# 準備 YouTube 影片 URL
|
457 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
458 |
+
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash")
|
|
|
|
|
459 |
|
460 |
# 建立影片部分
|
461 |
video_part = Part.from_uri(
|
|
|
512 |
return None
|
513 |
|
514 |
# 使用 Vertex AI 來處理轉換
|
515 |
+
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash")
|
516 |
|
517 |
# 設定每段最大字數
|
518 |
# 考慮到:
|
|
|
566 |
4. 回傳格式為 JSON array
|
567 |
5. 合理的合併句子,不要有不合理的斷句,一句話至少要有完整的主詞、謂詞
|
568 |
6. 每句話盡量在 10~15 個字左右,但要以完整語意為主
|
569 |
+
7. 如果遇到 [Music] [Laughter] [Crowd] [Cheering] [Applause]這類的標記,可以直接忽略不計
|
570 |
8. 這是第 {i+1}/{len(chunks)} 段,請確保時間軸的連續性
|
571 |
|
572 |
請直接返回 JSON 格式,不要加入任何說明文字或 markdown 標記。
|
|
|
690 |
# transcript = generate_transcription_by_gemini(video_id)
|
691 |
except Exception as e:
|
692 |
print(f" Error generating transcription: {str(e)}")
|
693 |
+
# transcript = generate_transcription_by_gemini(video_id)
|
694 |
# transcript = generate_transcription_by_whisper(video_id)
|
695 |
|
696 |
upload_transcript_to_gcs(video_id, transcript)
|
requirements.txt
CHANGED
@@ -3,7 +3,7 @@ pandas
|
|
3 |
openai>=1.16.2
|
4 |
requests
|
5 |
python-docx
|
6 |
-
youtube-transcript-api
|
7 |
moviepy==1.0.3
|
8 |
pytube
|
9 |
google-auth
|
|
|
3 |
openai>=1.16.2
|
4 |
requests
|
5 |
python-docx
|
6 |
+
youtube-transcript-api >= 1.1.0
|
7 |
moviepy==1.0.3
|
8 |
pytube
|
9 |
google-auth
|