Spaces:
Sleeping
Sleeping
get_transcript_by_yt_api
Browse files
app.py
CHANGED
@@ -155,7 +155,7 @@ def check_open_ai_access(open_ai_api_key):
|
|
155 |
client = OpenAI(api_key=open_ai_api_key)
|
156 |
try:
|
157 |
response = client.chat.completions.create(
|
158 |
-
model="gpt-
|
159 |
messages=[
|
160 |
{"role": "user", "content": "This is a test."},
|
161 |
],
|
@@ -399,10 +399,18 @@ def get_transcript_by_yt_api(video_id):
|
|
399 |
|
400 |
for language in languages:
|
401 |
try:
|
402 |
-
|
403 |
print("===transcript===")
|
404 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
405 |
print("===transcript===")
|
|
|
|
|
406 |
return transcript # 成功獲取字幕,直接返回結果
|
407 |
except NoTranscriptFound:
|
408 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
@@ -415,7 +423,7 @@ def generate_transcription_by_gemini(video_id):
|
|
415 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
416 |
|
417 |
# 初始化 Gemini Pro Vision 模型
|
418 |
-
model = vertexai.generative_models.GenerativeModel("gemini-2.
|
419 |
|
420 |
# 建立影片部分
|
421 |
video_part = Part.from_uri(
|
@@ -424,7 +432,7 @@ def generate_transcription_by_gemini(video_id):
|
|
424 |
)
|
425 |
|
426 |
# 設定提示詞
|
427 |
-
prompt = "
|
428 |
|
429 |
# 生成逐字稿
|
430 |
original_transcription = ""
|
@@ -434,7 +442,7 @@ def generate_transcription_by_gemini(video_id):
|
|
434 |
generation_config=vertexai.generative_models.GenerationConfig(
|
435 |
temperature=1.0,
|
436 |
top_p=0.95,
|
437 |
-
max_output_tokens=
|
438 |
candidate_count=1
|
439 |
),
|
440 |
stream=False
|
@@ -462,7 +470,7 @@ def generate_transcription_by_gemini(video_id):
|
|
462 |
|
463 |
def convert_transcription_to_json(original_transcription):
|
464 |
"""
|
465 |
-
將原始逐字稿轉換成指定的 JSON
|
466 |
|
467 |
Args:
|
468 |
original_transcription (str): 原始逐字稿文本
|
@@ -470,63 +478,104 @@ def convert_transcription_to_json(original_transcription):
|
|
470 |
Returns:
|
471 |
list: 包含逐字稿段落的列表,每個段落包含 text, start, end, duration
|
472 |
"""
|
473 |
-
|
|
|
|
|
|
|
474 |
# 使用 Vertex AI 來處理轉換
|
475 |
-
model = vertexai.generative_models.GenerativeModel("gemini-2.
|
476 |
|
477 |
-
|
478 |
-
|
479 |
-
|
|
|
|
|
|
|
|
|
480 |
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
4. 回傳格式為 JSON array
|
486 |
|
487 |
-
|
488 |
-
|
489 |
-
{{
|
490 |
-
"text": "在一片無人的森林裡",
|
491 |
-
"start": 1,
|
492 |
-
"end": 2,
|
493 |
-
"duration": 1
|
494 |
-
}},
|
495 |
-
{{
|
496 |
-
"text": "你撿到一張羊皮紙",
|
497 |
-
"start": 2,
|
498 |
-
"end": 4,
|
499 |
-
"duration": 2
|
500 |
-
}}
|
501 |
-
]
|
502 |
|
503 |
-
|
504 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
505 |
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
|
|
|
|
|
|
513 |
|
514 |
-
|
515 |
-
|
|
|
516 |
|
517 |
-
|
518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
519 |
|
520 |
-
|
521 |
-
|
522 |
-
if not all(k in entry for k in ["text", "start", "end", "duration"]):
|
523 |
-
raise ValueError("JSON 格式錯誤:缺少必要欄位")
|
524 |
-
|
525 |
-
return transcript_json
|
526 |
|
527 |
-
|
528 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
529 |
return None
|
|
|
|
|
|
|
|
|
|
|
530 |
|
531 |
def generate_transcription_by_whisper(video_id):
|
532 |
youtube_url = f'https://www.youtube.com/watch?v={video_id}'
|
@@ -607,9 +656,11 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
607 |
if not exists:
|
608 |
print("==== video transcript is not exists ====")
|
609 |
try:
|
610 |
-
transcript =
|
|
|
611 |
except Exception as e:
|
612 |
print(f"generate_transcription_by_gemini Error generating transcription: {str(e)}")
|
|
|
613 |
# transcript = generate_transcription_by_whisper(video_id)
|
614 |
|
615 |
upload_transcript_to_gcs(video_id, transcript)
|
@@ -640,6 +691,10 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
640 |
is_new_transcript = True
|
641 |
except Exception as e:
|
642 |
print(f"Error processing screenshot: {str(e)}")
|
|
|
|
|
|
|
|
|
643 |
else:
|
644 |
entry['img_file_id'] = ""
|
645 |
print(f"截圖空白")
|
|
|
155 |
client = OpenAI(api_key=open_ai_api_key)
|
156 |
try:
|
157 |
response = client.chat.completions.create(
|
158 |
+
model="gpt-4o",
|
159 |
messages=[
|
160 |
{"role": "user", "content": "This is a test."},
|
161 |
],
|
|
|
399 |
|
400 |
for language in languages:
|
401 |
try:
|
402 |
+
yt_api_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
|
403 |
print("===transcript===")
|
404 |
+
print(yt_api_transcript)
|
405 |
+
|
406 |
+
transcript = ""
|
407 |
+
for entry in yt_api_transcript:
|
408 |
+
transcript_part = (f"{entry['start']:.0f}s: {entry['text']}")
|
409 |
+
print(transcript_part)
|
410 |
+
original_transcript += f"{transcript_part} \n"
|
411 |
print("===transcript===")
|
412 |
+
|
413 |
+
transcript = convert_transcription_to_json(original_transcript)
|
414 |
return transcript # 成功獲取字幕,直接返回結果
|
415 |
except NoTranscriptFound:
|
416 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
|
|
423 |
video_url = f"https://www.youtube.com/watch?v={video_id}"
|
424 |
|
425 |
# 初始化 Gemini Pro Vision 模型
|
426 |
+
model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash-preview-05-20")
|
427 |
|
428 |
# 建立影片部分
|
429 |
video_part = Part.from_uri(
|
|
|
432 |
)
|
433 |
|
434 |
# 設定提示詞
|
435 |
+
prompt = "給我包含時間軸的完整逐字稿,包含時間軸跟原文內容,一句話一行"
|
436 |
|
437 |
# 生成逐字稿
|
438 |
original_transcription = ""
|
|
|
442 |
generation_config=vertexai.generative_models.GenerationConfig(
|
443 |
temperature=1.0,
|
444 |
top_p=0.95,
|
445 |
+
max_output_tokens=65535,
|
446 |
candidate_count=1
|
447 |
),
|
448 |
stream=False
|
|
|
470 |
|
471 |
def _split_transcript_into_chunks(text, max_chunk_size):
    """Group the non-empty lines of ``text`` into newline-joined chunks.

    Each line is stripped; blank lines are dropped. Lines are accumulated
    until adding the next one would push the chunk's character count
    (newline separators not counted) over ``max_chunk_size``. A single line
    longer than the limit becomes a chunk of its own rather than being split.

    Args:
        text (str): Raw transcript text.
        max_chunk_size (int): Soft upper bound on characters per chunk.

    Returns:
        list[str]: The chunks in original order; empty when ``text`` holds
        no non-blank line.
    """
    chunks = []
    current_lines = []
    current_size = 0

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Flush the running chunk before it would exceed the limit.
        if current_lines and current_size + len(line) > max_chunk_size:
            chunks.append('\n'.join(current_lines))
            current_lines = []
            current_size = 0

        current_lines.append(line)
        current_size += len(line)

    # Flush whatever is left after the last line.
    if current_lines:
        chunks.append('\n'.join(current_lines))

    return chunks


def _parse_transcript_chunk(json_str, chunk_number):
    """Parse and validate the model's JSON reply for one chunk.

    Args:
        json_str (str): Raw model reply, possibly wrapped in markdown fences.
        chunk_number (int): 1-based chunk index, used in error messages.

    Returns:
        list[dict]: Entries that each contain text, start, end and duration,
        with a numeric ``start``.

    Raises:
        ValueError: If the reply is not valid JSON, is not a JSON array, an
            entry is not an object with all required keys, or ``start``
            cannot be interpreted as a number.
    """
    # Strip markdown code fences the model may add despite the instructions.
    cleaned = json_str.replace("```json", "").replace("```", "").strip()
    entries = json.loads(cleaned)

    if not isinstance(entries, list):
        raise ValueError(f"JSON 格式錯誤:回傳內容不是 JSON array,在第 {chunk_number} 段")

    required_keys = ("text", "start", "end", "duration")
    for entry in entries:
        # The dict check matters: on a plain string, `k in entry` would be a
        # substring test rather than a key lookup.
        if not isinstance(entry, dict) or not all(k in entry for k in required_keys):
            raise ValueError(f"JSON 格式錯誤:缺少必要欄位,在第 {chunk_number} 段")

        # `start` is the final sort key, so it must be numeric. Numeric
        # strings are converted; anything else rejects this chunk here
        # instead of failing the sort for every chunk later.
        start = entry["start"]
        if isinstance(start, str):
            entry["start"] = float(start)
        elif isinstance(start, bool) or not isinstance(start, (int, float)):
            raise ValueError(f"JSON 格式錯誤:start 不是數字,在第 {chunk_number} 段")

    return entries


def convert_transcription_to_json(original_transcription):
    """
    Convert a raw transcript into a list of timed segments via Gemini.

    The transcript is split into chunks, each chunk is sent to a Vertex AI
    generative model with a prompt asking for a JSON array, and the parsed
    results of all chunks are merged and sorted by start time. A chunk whose
    request or parsing fails is logged and skipped (best effort), so the
    result may be partial.

    Args:
        original_transcription (str): Raw transcript text.

    Returns:
        list | None: Segments, each a dict with text, start, end and
        duration, ordered by ``start``. ``None`` when the input is empty or
        blank, or when no chunk produced a valid result.
    """
    if not original_transcription:
        print("原始逐字稿為空")
        return None

    # Maximum characters per chunk. Chosen with these constraints in mind:
    # 1. Gemini's output limit is 65,535 tokens
    # 2. room must be left for the prompt itself
    # 3. the JSON format adds extra characters
    # 4. one Chinese character is roughly 2-3 tokens
    MAX_CHUNK_SIZE = 15000

    chunks = _split_transcript_into_chunks(original_transcription, MAX_CHUNK_SIZE)
    if not chunks:
        # Whitespace-only input: nothing to convert, so do not create a model.
        print("原始逐字稿為空")
        return None

    # Use Vertex AI to perform the conversion.
    model = vertexai.generative_models.GenerativeModel("gemini-2.5-flash-preview-05-20")

    # Accumulates the entries of every successfully processed chunk.
    all_results = []
    total_chunks = len(chunks)

    for chunk_number, chunk in enumerate(chunks, start=1):
        print(f"===chunk: {chunk_number}===")

        prompt = f"""
        請將以下逐字稿轉換成 JSON 格式:
        {chunk}

        轉換規則:
        1. 每個段落需包含 text, start, end, duration
        2. 時間格式需轉換為秒數(例如 1:02 轉為 62 秒)
        3. duration 為 end - start 的差值
        4. 回傳格式為 JSON array
        5. 合理的合併句子,不要有不合理的斷句,一句話至少要有完整的主詞、謂詞
        6. 每句話盡量在 10~15 個字左右,但要以完整語意為主
        7. 如果遇到 [Music] 這類的標記,可以直接忽略不計
        8. 這是第 {chunk_number}/{total_chunks} 段,請確保時間軸的連續性

        請直接返回 JSON 格式,不要加入任何說明文字或 markdown 標記。
        """

        try:
            response = model.generate_content(prompt)
            json_str = response.text

            print(f"===json_str for chunk {chunk_number}===")
            print(json_str)
            print(f"===json_str for chunk {chunk_number}===")

            all_results.extend(_parse_transcript_chunk(json_str, chunk_number))

        except Exception as e:
            # Best effort: one bad chunk must not discard the others.
            print(f"處理第 {chunk_number} 段時發生錯誤:{str(e)}")
            continue

    # No chunk yielded a valid result.
    if not all_results:
        return None

    # Order by start time; safe because every `start` was checked as numeric.
    all_results.sort(key=lambda x: x["start"])

    return all_results
|
579 |
|
580 |
def generate_transcription_by_whisper(video_id):
|
581 |
youtube_url = f'https://www.youtube.com/watch?v={video_id}'
|
|
|
656 |
if not exists:
|
657 |
print("==== video transcript is not exists ====")
|
658 |
try:
|
659 |
+
transcript = get_transcript_by_yt_api(video_id)
|
660 |
+
# transcript = generate_transcription_by_gemini(video_id)
|
661 |
except Exception as e:
|
662 |
print(f"generate_transcription_by_gemini Error generating transcription: {str(e)}")
|
663 |
+
transcript = generate_transcription_by_gemini(video_id)
|
664 |
# transcript = generate_transcription_by_whisper(video_id)
|
665 |
|
666 |
upload_transcript_to_gcs(video_id, transcript)
|
|
|
691 |
is_new_transcript = True
|
692 |
except Exception as e:
|
693 |
print(f"Error processing screenshot: {str(e)}")
|
694 |
+
# 如果影片有下載成功,但是截圖失敗,則將 img_file_id 設為空字串
|
695 |
+
entry['img_file_id'] = ""
|
696 |
+
print(f"截圖空白")
|
697 |
+
is_new_transcript = True
|
698 |
else:
|
699 |
entry['img_file_id'] = ""
|
700 |
print(f"截圖空白")
|