import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from docx import Document
import os
from openai import OpenAI
import json
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound
from moviepy.editor import VideoFileClip
from pytube import YouTube
from google.cloud import storage
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.http import MediaIoBaseUpload
import io
from urllib.parse import urlparse, parse_qs
# Assumes the service-account key is stored in an environment variable / Secret
# named GOOGLE_APPLICATION_CREDENTIALS_JSON
# credentials_json_string = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
# credentials_dict = json.loads(credentials_json_string)
# SCOPES = ['https://www.googleapis.com/auth/drive']
# credentials = service_account.Credentials.from_service_account_info(
#     credentials_dict, scopes=SCOPES)
# service = build('drive', 'v3', credentials=credentials)
# # List the first 10 files on Google Drive
# results = service.files().list(pageSize=10, fields="nextPageToken, files(id, name)").execute()
# items = results.get('files', [])
# if not items:
#     print('No files found.')
# else:
#     print("=====Google Drive 上的前10個文件=====")
#     print('Files:')
#     for item in items:
#         print(u'{0} ({1})'.format(item['name'], item['id']))
OUTPUT_PATH = 'videos'
TRANSCRIPTS = []
CURRENT_INDEX = 0
VIDEO_ID = ""

OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
client = OpenAI(api_key=OPEN_AI_KEY)
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
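
# Required configuration (as implied by the os.getenv calls above): the Space
# needs an OPEN_AI_KEY secret with an OpenAI API key, and a
# GOOGLE_APPLICATION_CREDENTIALS_JSON secret holding a Google service-account
# key as a JSON string (the same key is reused for both Drive and GCS access).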
# ==== GCS helpers ====
def init_gcs_client(service_account_key_string):
    """Create a GCS client from a service-account key JSON string."""
    credentials_json_string = service_account_key_string
    credentials_dict = json.loads(credentials_json_string)
    credentials = service_account.Credentials.from_service_account_info(credentials_dict)
    gcs_client = storage.Client(credentials=credentials, project=credentials_dict['project_id'])
    return gcs_client

def gcs_create_bucket_folder_if_not_exists(gcs_client, bucket_name, folder_name):
    """Check whether a 'folder' (prefix) exists; if not, create a marker blob to emulate one."""
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(folder_name)
    if not blob.exists():
        blob.upload_from_string('', content_type='application/x-www-form-urlencoded;charset=UTF-8')
        print(f"GCS Folder '{folder_name}' created.")
    else:
        print(f"GCS Folder '{folder_name}' already exists.")

def gcs_check_folder_exists(gcs_client, bucket_name, folder_name):
    """Check whether the given folder (prefix) exists in the GCS bucket."""
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))
    return len(blobs) > 0

def gcs_check_file_exists(gcs_client, bucket_name, file_name):
    """
    Check whether the given file exists in the GCS bucket.
    file_name format: {folder_name}/{file_name}
    """
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(file_name)
    return blob.exists()

def upload_file_to_gcs(gcs_client, bucket_name, destination_blob_name, file_path):
    """Upload a local file to the given GCS bucket."""
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(file_path)
    print(f"File {file_path} uploaded to {destination_blob_name} in GCS.")

def upload_file_to_gcs_with_json_string(gcs_client, bucket_name, destination_blob_name, json_string):
    """Upload a string to the given GCS bucket."""
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_string(json_string)
    print(f"JSON string uploaded to {destination_blob_name} in GCS.")

def download_blob_to_string(gcs_client, bucket_name, source_blob_name):
    """Download a GCS object's content as a string."""
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    return blob.download_as_text()

def make_blob_public(gcs_client, bucket_name, blob_name):
    """Make the given GCS object publicly readable."""
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.make_public()
    print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")

def get_blob_public_url(gcs_client, bucket_name, blob_name):
    """Get the public URL of the given GCS object."""
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    return blob.public_url

def upload_img_and_get_public_url(gcs_client, bucket_name, file_name, file_path):
    """Upload an image to GCS and return its public URL."""
    # Upload the image
    upload_file_to_gcs(gcs_client, bucket_name, file_name, file_path)
    # Make the uploaded image public
    make_blob_public(gcs_client, bucket_name, file_name)
    # Get the image's public URL
    public_url = get_blob_public_url(gcs_client, bucket_name, file_name)
    print(f"Public URL for the uploaded image: {public_url}")
    return public_url
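
# A minimal usage sketch for the GCS helpers above (hedged example, not part of
# the app flow). It assumes GOOGLE_APPLICATION_CREDENTIALS_JSON holds a
# service-account key JSON string; the object name "demo/hello.txt" is a
# placeholder:
#
#   gcs_client = init_gcs_client(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON"))
#   bucket_name = "video_ai_assistant"
#   if not gcs_check_file_exists(gcs_client, bucket_name, "demo/hello.txt"):
#       upload_file_to_gcs_with_json_string(gcs_client, bucket_name,
#                                           "demo/hello.txt", "hello world")
#   print(download_blob_to_string(gcs_client, bucket_name, "demo/hello.txt"))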
def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
    # Get all files from the Drive folder
    query = f"'{drive_folder_id}' in parents and trashed = false"
    response = drive_service.files().list(q=query).execute()
    files = response.get('files', [])

    for file in files:
        # Copy each file to GCS
        file_id = file['id']
        file_name = file['name']
        gcs_destination_path = f"{gcs_folder_name}/{file_name}"
        copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name, gcs_destination_path)

def copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name, gcs_destination_path):
    # Download the file content from Drive
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
    fh.seek(0)
    file_content = fh.getvalue()

    # Upload the file content to GCS
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(gcs_destination_path)
    blob.upload_from_string(file_content)
    print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
# ==== Google Drive helpers: initialization ====
def init_drive_service():
    credentials_json_string = DRIVE_KEY
    credentials_dict = json.loads(credentials_json_string)
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = service_account.Credentials.from_service_account_info(
        credentials_dict, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)
    return service
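
# A minimal usage sketch for the Drive helpers below (hedged example, not part
# of the app flow). "YOUR_PARENT_FOLDER_ID" is a placeholder for a Drive folder
# the service account can access:
#
#   service = init_drive_service()
#   folder_id = create_folder_if_not_exists(service, "demo_folder", "YOUR_PARENT_FOLDER_ID")
#   file_id = upload_content_directly(service, "hello.txt", folder_id, "hello world")
#   print(download_file_as_string(service, file_id))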
def create_folder_if_not_exists(service, folder_name, parent_id):
    print("检查是否存在特定名称的文件夹,如果不存在则创建")
    query = f"mimeType='application/vnd.google-apps.folder' and name='{folder_name}' and '{parent_id}' in parents and trashed=false"
    response = service.files().list(q=query, spaces='drive', fields="files(id, name)").execute()
    folders = response.get('files', [])

    if not folders:
        # The folder does not exist; create it
        file_metadata = {
            'name': folder_name,
            'mimeType': 'application/vnd.google-apps.folder',
            'parents': [parent_id]
        }
        folder = service.files().create(body=file_metadata, fields='id').execute()
        return folder.get('id')
    else:
        # The folder already exists
        return folders[0]['id']

# Check whether a file exists on Google Drive
def check_file_exists(service, folder_id, file_name):
    query = f"name = '{file_name}' and '{folder_id}' in parents and trashed = false"
    response = service.files().list(q=query).execute()
    files = response.get('files', [])
    return len(files) > 0, files[0]['id'] if files else None
def upload_content_directly(service, file_name, folder_id, content):
    """
    Upload text content directly into a new file on Google Drive.
    """
    if not file_name:
        raise ValueError("文件名不能为空")
    if not folder_id:
        raise ValueError("文件夹ID不能为空")
    if content is None:  # Allow empty strings, but not None
        raise ValueError("内容不能为空")

    file_metadata = {'name': file_name, 'parents': [folder_id]}
    # Wrap the text content in an in-memory file object
    try:
        with io.BytesIO(content.encode('utf-8')) as fh:
            media = MediaIoBaseUpload(fh, mimetype='text/plain', resumable=True)
            print("==content==")
            print(content)
            print("==content==")
            print("==media==")
            print(media)
            print("==media==")
            # Perform the upload
            file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
            return file.get('id')
    except Exception as e:
        print(f"上传文件时发生错误: {e}")
        raise  # Re-raise so the caller can decide how to handle it
def upload_file_directly(service, file_name, folder_id, file_path):
    # Upload a .json file to Google Drive
    file_metadata = {'name': file_name, 'parents': [folder_id]}
    media = MediaFileUpload(file_path, mimetype='application/json')
    file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    # return file.get('id')  # Return the file id
    return True

def upload_img_directly(service, file_name, folder_id, file_path):
    file_metadata = {'name': file_name, 'parents': [folder_id]}
    media = MediaFileUpload(file_path, mimetype='image/jpeg')
    file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    return file.get('id')  # Return the file id
def download_file_as_string(service, file_id):
    """
    Download a file from Google Drive and return its content as a string.
    """
    request = service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
    fh.seek(0)
    content = fh.read().decode('utf-8')
    return content

def set_public_permission(service, file_id):
    service.permissions().create(
        fileId=file_id,
        body={"type": "anyone", "role": "reader"},
        fields='id',
    ).execute()
def update_file_on_drive(service, file_id, file_content):
    """
    Update the content of an existing file on Google Drive.

    Args:
        service: Google Drive API service instance.
        file_id: id of the file to update.
        file_content: new file content as a string.
    """
    # Convert the new content into a byte stream
    fh = io.BytesIO(file_content.encode('utf-8'))
    media = MediaIoBaseUpload(fh, mimetype='application/json', resumable=True)

    # Update the file
    updated_file = service.files().update(
        fileId=file_id,
        media_body=media
    ).execute()
    print(f"文件已更新,文件ID: {updated_file['id']}")

# ====drive====
def process_file(file):
    # Read the uploaded file
    if file.name.endswith('.csv'):
        df = pd.read_csv(file)
        text = df_to_text(df)
    elif file.name.endswith('.xlsx'):
        df = pd.read_excel(file)
        text = df_to_text(df)
    elif file.name.endswith('.docx'):
        text = docx_to_text(file)
    else:
        raise ValueError("Unsupported file type")

    df_string = text
    # Yilan dataset quirk: replace @XX@ markers with |
    df_string = df_string.replace("@XX@", "|")

    # Generate questions and a summary from the uploaded content
    questions = generate_questions(df_string)
    summary = generate_summarise(df_string)

    # Return the button texts and the data text
    return questions[0] if len(questions) > 0 else "", \
        questions[1] if len(questions) > 1 else "", \
        questions[2] if len(questions) > 2 else "", \
        summary, \
        df_string
def df_to_text(df):
    # Convert the DataFrame to plain text
    return df.to_string()

def docx_to_text(file):
    # Convert a Word document to plain text
    doc = Document(file)
    return "\n".join([para.text for para in doc.paragraphs])

def format_seconds_to_time(seconds):
    """Format a number of seconds as HH:MM:SS."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = int(seconds % 60)
    return f"{hours:02}:{minutes:02}:{seconds:02}"
def extract_youtube_id(url):
    parsed_url = urlparse(url)
    if "youtube.com" in parsed_url.netloc:
        # Standard links keep the video id in the 'v' query parameter
        query_params = parse_qs(parsed_url.query)
        return query_params.get("v")[0] if "v" in query_params else None
    elif "youtu.be" in parsed_url.netloc:
        # Short links keep the video id in the path
        return parsed_url.path.lstrip('/')
    else:
        return None
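
# Both common URL forms resolve to the same id (hedged example, placeholder id):
#   extract_youtube_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  ->  "dQw4w9WgXcQ"
#   extract_youtube_id("https://youtu.be/dQw4w9WgXcQ")                 ->  "dQw4w9WgXcQ"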
def get_transcript(video_id):
    languages = ['zh-TW', 'zh-Hant', 'en']  # Priority order
    for language in languages:
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            return transcript  # Captions found; return them directly
        except NoTranscriptFound:
            continue  # No captions in this language; try the next one
    return None  # All attempts failed
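
# Each transcript entry returned by youtube_transcript_api looks roughly like
# (hedged example of the library's documented shape):
#   {"text": "Hello there", "start": 7.58, "duration": 6.13}
# which is the shape the processing below relies on ('text', 'start', 'duration').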
def process_transcript_and_screenshots(video_id):
    print("====process_transcript_and_screenshots====")

    # Google Drive
    service = init_drive_service()
    parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
    folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)

    # Transcript file name
    file_name = f'{video_id}_transcript.json'
    # Check whether the transcript already exists
    exists, file_id = check_file_exists(service, folder_id, file_name)
    if not exists:
        # Fetch the transcript from YouTube and upload it
        transcript = get_transcript(video_id)
        if transcript:
            print("成功獲取字幕")
        else:
            print("沒有找到字幕")
            # Fail early with a clear error instead of uploading an empty transcript
            raise ValueError(f"No transcript found for video {video_id}")
        transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
        file_id = upload_content_directly(service, file_name, folder_id, transcript_text)
        print("逐字稿已上传到Google Drive")
    else:
        # The transcript already exists; download its content
        print("逐字稿已存在于Google Drive中")
        transcript_text = download_file_as_string(service, file_id)
        transcript = json.loads(transcript_text)

    # For every transcript entry, take and upload a screenshot if needed
    for entry in transcript:
        if 'img_file_id' not in entry:
            screenshot_path = screenshot_youtube_video(video_id, entry['start'])
            img_file_id = upload_img_directly(service, f"{video_id}_{entry['start']}.jpg", folder_id, screenshot_path)
            set_public_permission(service, img_file_id)
            entry['img_file_id'] = img_file_id
            print(f"截图已上传到Google Drive: {img_file_id}")

    # Update the transcript file with the screenshot links
    updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
    update_file_on_drive(service, file_id, updated_transcript_text)
    print("逐字稿已更新,包括截图链接")

    # Init the GCS client and mirror the Drive folder
    gcs_client = init_gcs_client(GCS_KEY)
    bucket_name = 'video_ai_assistant'
    # Check whether the folder already exists on GCS
    is_gcs_exists = gcs_check_folder_exists(gcs_client, bucket_name, video_id)
    if not is_gcs_exists:
        gcs_create_bucket_folder_if_not_exists(gcs_client, bucket_name, video_id)
        copy_all_files_from_drive_to_gcs(service, gcs_client, folder_id, bucket_name, video_id)
        print("Drive file 已上传到GCS")
    else:
        print(f"GCS folder:{video_id} 已存在")

    return transcript
def process_transcript_and_screenshots_on_gcs(video_id):
    print("====process_transcript_and_screenshots_on_gcs====")
    # GCS
    gcs_client = init_gcs_client(GCS_KEY)
    bucket_name = 'video_ai_assistant'

    # Check whether the folder already exists
    # is_gcs_exists = gcs_check_folder_exists(gcs_client, bucket_name, video_id)
    # if not is_gcs_exists:
    #     gcs_create_bucket_folder_if_not_exists(gcs_client, bucket_name, video_id)
    #     print(f"GCS folder:{video_id} 已创建")
    # else:
    #     print(f"GCS folder:{video_id} 已存在")

    # Transcript file name
    transcript_file_name = f'{video_id}_transcript.json'
    transcript_blob_name = f"{video_id}/{transcript_file_name}"
    # Check whether the transcript already exists
    exists = gcs_check_file_exists(gcs_client, bucket_name, transcript_blob_name)
    if not exists:
        # Fetch the transcript from YouTube and upload it
        transcript = get_transcript(video_id)
        if transcript:
            print("成功獲取字幕")
        else:
            print("沒有找到字幕")
            # Fail early with a clear error instead of uploading an empty transcript
            raise ValueError(f"No transcript found for video {video_id}")
        transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
        upload_file_to_gcs_with_json_string(gcs_client, bucket_name, transcript_blob_name, transcript_text)
        print("逐字稿已上传到GCS")
    else:
        # The transcript already exists on GCS; download its content
        print("Transcript already exists on GCS")
        transcript_text = download_blob_to_string(gcs_client, bucket_name, transcript_blob_name)
        transcript = json.loads(transcript_text)

    # For every transcript entry, take and upload a screenshot if needed
    for entry in transcript:
        if 'img_file_id' not in entry:
            screenshot_path = screenshot_youtube_video(video_id, entry['start'])
            img_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
            img_file_id = upload_img_and_get_public_url(gcs_client, bucket_name, img_blob_name, screenshot_path)
            entry['img_file_id'] = img_file_id
            print(f"截图已上传到GCS: {img_file_id}")

    # Update the transcript file with the screenshot URLs
    updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
    upload_file_to_gcs_with_json_string(gcs_client, bucket_name, transcript_blob_name, updated_transcript_text)
    print("逐字稿已更新,包括截图链接")

    return transcript
def process_youtube_link(link):
    # Extract the video id and build the transcript for this YouTube link
    video_id = extract_youtube_id(link)
    global VIDEO_ID
    VIDEO_ID = video_id

    download_youtube_video(video_id, output_path=OUTPUT_PATH)

    try:
        # transcript = process_transcript_and_screenshots(video_id)
        transcript = process_transcript_and_screenshots_on_gcs(video_id)
    except Exception as e:
        error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
        print("===process_youtube_link error===")
        print(error_msg)
        raise gr.Error(error_msg)

    formatted_transcript = []
    formatted_simple_transcript = []
    screenshot_paths = []
    for entry in transcript:
        start_time = format_seconds_to_time(entry['start'])
        end_time = format_seconds_to_time(entry['start'] + entry['duration'])
        embed_url = get_embedded_youtube_link(video_id, entry['start'])
        img_file_id = entry['img_file_id']
        # GCS entries already store a full public URL; Drive entries store a file id
        if str(img_file_id).startswith("http"):
            screenshot_path = img_file_id
        else:
            screenshot_path = f"https://lh3.googleusercontent.com/d/{img_file_id}=s4000"
        line = {
            "start_time": start_time,
            "end_time": end_time,
            "text": entry['text'],
            "embed_url": embed_url,
            "screenshot_path": screenshot_path
        }
        formatted_transcript.append(line)
        # formatted_simple_transcript keeps only start_time, end_time, text
        simple_line = {
            "start_time": start_time,
            "end_time": end_time,
            "text": entry['text']
        }
        formatted_simple_transcript.append(simple_line)
        screenshot_paths.append(screenshot_path)

    global TRANSCRIPTS
    TRANSCRIPTS = formatted_transcript

    # Generate the remaining outputs from the transcript
    questions = get_questions(video_id, formatted_simple_transcript)
    formatted_transcript_json = json.dumps(formatted_transcript, ensure_ascii=False, indent=2)
    summary_json = get_video_id_summary(video_id, formatted_simple_transcript)
    summary = summary_json["summary"]
    html_content = format_transcript_to_html(formatted_transcript)
    simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
    first_image = formatted_transcript[0]['screenshot_path']
    first_text = formatted_transcript[0]['text']
    mind_map_json = get_mind_map(video_id, formatted_simple_transcript)
    mind_map = mind_map_json["mind_map"]
    mind_map_html = get_mind_map_html(mind_map)

    # Return the outputs in the order the UI components expect
    return questions[0] if len(questions) > 0 else "", \
        questions[1] if len(questions) > 1 else "", \
        questions[2] if len(questions) > 2 else "", \
        formatted_transcript_json, \
        summary, \
        mind_map, \
        mind_map_html, \
        html_content, \
        simple_html_content, \
        first_image, \
        first_text
def format_transcript_to_html(formatted_transcript):
    html_content = ""
    for entry in formatted_transcript:
        html_content += f"<h3>{entry['start_time']} - {entry['end_time']}</h3>"
        html_content += f"<p>{entry['text']}</p>"
        html_content += f"<img src='{entry['screenshot_path']}' width='500px' />"
    return html_content

def format_simple_transcript_to_html(formatted_transcript):
    html_content = ""
    for entry in formatted_transcript:
        html_content += f"<h3>{entry['start_time']} - {entry['end_time']}</h3>"
        html_content += f"<p>{entry['text']}</p>"
    return html_content

def get_embedded_youtube_link(video_id, start_time):
    int_start_time = int(start_time)
    embed_url = f"https://www.youtube.com/embed/{video_id}?start={int_start_time}&autoplay=1"
    return embed_url
def download_youtube_video(youtube_id, output_path=OUTPUT_PATH):
    # Construct the full YouTube URL
    youtube_url = f'https://www.youtube.com/watch?v={youtube_id}'

    # Create the output directory if it doesn't exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Download the video
    yt = YouTube(youtube_url)
    video_stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    video_stream.download(output_path=output_path, filename=youtube_id + ".mp4")

    print(f"Video downloaded successfully: {output_path}/{youtube_id}.mp4")

def screenshot_youtube_video(youtube_id, snapshot_sec):
    video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'
    file_name = f"{youtube_id}_{snapshot_sec}.jpg"
    with VideoFileClip(video_path) as video:
        screenshot_path = f'{OUTPUT_PATH}/{file_name}'
        video.save_frame(screenshot_path, snapshot_sec)
    return screenshot_path
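
# The two helpers above are meant to be chained (hedged example; "dQw4w9WgXcQ"
# is a placeholder id): download the video once, then grab frames at the
# transcript timestamps.
#
#   download_youtube_video("dQw4w9WgXcQ")
#   frame_path = screenshot_youtube_video("dQw4w9WgXcQ", 12.0)
#   print(frame_path)  # e.g. videos/dQw4w9WgXcQ_12.0.jpg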
def process_web_link(link):
    # Fetch and parse the web page content
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
def get_mind_map(video_id, df_string):
    # First check Google Drive for an existing {video_id}_mind_map.json
    print("===get_mind_map===")
    service = init_drive_service()
    parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
    folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
    file_name = f'{video_id}_mind_map.json'

    # Check whether the file already exists
    exists, file_id = check_file_exists(service, folder_id, file_name)
    if not exists:
        mind_map = generate_mind_map(df_string)
        mind_map_json = {"mind_map": str(mind_map)}
        mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
        upload_content_directly(service, file_name, folder_id, mind_map_text)
        print("mind_map已上傳到Google Drive")
    else:
        # The mind map already exists; download its content
        print("mind_map已存在于Google Drive中")
        mind_map_text = download_file_as_string(service, file_id)
        mind_map_json = json.loads(mind_map_text)

    return mind_map_json
def generate_mind_map(df_string):
    # Use OpenAI to build a markdown mind map from the uploaded data
    sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
    user_content = f"""
        請根據 {df_string} 文本建立 markdown 心智圖
        注意:不需要前後文敘述,直接給出 markdown 文本即可
        這對我很重要
    """
    messages = [
        {"role": "system", "content": sys_content},
        {"role": "user", "content": user_content}
    ]
    request_payload = {
        "model": "gpt-4-1106-preview",
        "messages": messages,
        "max_tokens": 4000,
    }
    response = client.chat.completions.create(**request_payload)
    mind_map = response.choices[0].message.content.strip()
    print("=====mind_map=====")
    print(mind_map)
    print("=====mind_map=====")
    return mind_map
def get_mind_map_html(mind_map):
    mind_map_markdown = mind_map.replace("```markdown", "").replace("```", "")
    mind_map_html = f"""
    <div class="markmap">
        <script type="text/template">
            {mind_map_markdown}
        </script>
    </div>
    """
    return mind_map_html
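
# The <script type="text/template"> block inside a div with class "markmap" is
# the format the markmap autoloader looks for; the markdown placed there is what
# gets rendered as the interactive mind map by the script referenced in HEAD below.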
def processed_video_summary_to_json(summary):
    """
    The overall summary format is:
    1. 內容類型
    2. 整體摘要
    3. 條列式重點
    4. 關鍵時刻(段落摘要)
    5. 結論反思(為什麼我們要學這個?)
    6. 延伸小問題
    Split the summary string on these headings to extract each part.
    example:
    1. 內容類型:影片類型\n\n2. 整體摘要\n本段影片透過一組劇情式的場景講述,描述了一群人物進行VR教育體驗的故事,涵蓋了冒險、探索、學習和救援等元素。影片同時充分融合了互動問答和地理科學知識,並對南極和北極的地理環境、生態系統以及國際政治局勢進行了介紹。\n\n3. 條列式重點\n- VR教育體驗的場景設置。\n- 冒險遊戲中融入地理科學知識。\n- 南北極的環境差異和重要性。\n- 介紹了南極條約的內容。\n- 探討全球暖化對極地生物的影響。\n\n4. 關鍵時刻(段落摘要)\n【00:00:05 - 00:00:21】: 一群人物於周末下午前往VR教育體驗館,選擇了\"極地探險\"遊戲,透過體感裝置體驗寒冷和震動。\n【00:00:34 - 00:02:00】: 故事背景設置,玩家需解開科學家R的神秘失蹤之謎,在極地嚴酷環境中尋找線索。期間介紹了南極的地理氣候和與英國的關係。\n【00:03:08 - 00:03:23】: 透過與店員互動回答來隱喻人工智慧語言模型的限制,並提示玩家搜集線索進行冒險。\n【00:04:41 - 00:05:03】: 發現北極熊的照片,玩家決定行動至北極,並提供了地形知識問答。\n【00:06:01 - 00:08:11】: 揭露科學家R被綁架的原因,並討論了北極的政治和經濟重要性。最後強調保護環境的重要性。\n\n5. 結論反思(為什麼我們要學這個?)\n通過這個故事情節,學習者不僅能夠體驗虛擬實境的樂趣,也能夠學習到關於地理、生態、以及環境保護的知識,提高對全球環境議題的認知和理解。它教導我們通過娛樂來學習如何關懷地球的未來,同時也啟發了對於科學研究與國際政治的基本認知。\n\n6. 延伸小問題\n- 你認為VR遊戲在教育上有哪些潛力?\n- 與南極相比,為什麼北極會成為各國政治和經濟角力的場所?\n- 全球暖化對極地動物的生存造成了哪些影響?我們能做些什麼來幫助改善這種情況?
    """
    # 1. 內容類型 -> take the text after "1. 內容類型:"
    content_type = summary.split("1. 內容類型:")[1].split("\n")[0].strip()
    # 2. 整體摘要
    overall_summary = summary.split("2. 整體摘要")[1].split("\n\n")[1].strip()
    # 3. 條列式重點
    key_points = summary.split("3. 條列式重點")[1].split("\n\n")[1].strip()
    # 4. 關鍵時刻(段落摘要)
    key_moments = summary.split("4. 關鍵時刻(段落摘要)")[1].split("\n\n")[1].strip()
    # 5. 結論反思(為什麼我們要學這個?)
    conclusion_reflection = summary.split("5. 結論反思(為什麼我們要學這個?)")[1].split("\n\n")[1].strip()
    # 6. 延伸小問題
    extension_questions = summary.split("6. 延伸小問題")[1].split("\n\n")[1].strip()

    summary_json = {
        "content_type": content_type,
        "overall_summary": overall_summary,
        "key_points": key_points,
        "key_moments": key_moments,
        "conclusion_reflection": conclusion_reflection,
        "extension_questions": extension_questions
    }
    print("===processed_video_summary_to_json===")
    print(summary_json)
    print("===processed_video_summary_to_json===")
    return summary_json
# Get the content of {video_id}_summary.json
def get_video_id_summary(video_id, df_string):
    print("===get_video_id_summary===")
    service = init_drive_service()
    parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
    folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
    file_name = f'{video_id}_summary.json'

    # Check whether the summary already exists
    exists, file_id = check_file_exists(service, folder_id, file_name)
    if not exists:
        summary = generate_summarise(df_string)
        # processed_summary = processed_video_summary_to_json(summary)
        summary_json = {"summary": str(summary)}
        summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
        try:
            upload_content_directly(service, file_name, folder_id, summary_text)
            print("summary已上傳到Google Drive")
        except Exception as e:
            error_msg = f" {video_id} 摘要錯誤: {str(e)}"
            print("===get_video_id_summary error===")
            print(error_msg)
            print("===get_video_id_summary error===")
        # Fallback idea: save locally under OUTPUT_PATH as {video_id}_summary.json
        # with open(f'{OUTPUT_PATH}/{video_id}_summary.json', 'w') as f:
        #     f.write(summary_text)
        # print(f"summary已存在 local at {OUTPUT_PATH}/{video_id}_summary.json")
        # file_id = upload_file_directly(service, file_name, folder_id, f'{OUTPUT_PATH}/{video_id}_summary.json')
    else:
        # The summary already exists; download its content
        print("summary已存在Google Drive中")
        summary_text = download_file_as_string(service, file_id)
        summary_json = json.loads(summary_text)

    return summary_json
def generate_summarise(df_string):
    # Use OpenAI to summarise the uploaded data
    sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
    user_content = f"""
        請根據 {df_string},判斷這份文本
        如果是資料類型,請提估欄位敘述、資料樣態與資料分析,告訴學生這張表的意義,以及可能的結論與對應方式
        如果是影片類型,請提估影片內容,告訴學生這部影片的意義,
        小範圍切出不同段落的相對應時間軸的重點摘要,最多不超過五段
        注意不要遺漏任何一段時間軸的內容
        格式為 【start - end】: 摘要
        以及可能的結論與結尾延伸小問題提供學生作反思
        整體格式為:
        🗂️ 1. 內容類型:?
        📚 2. 整體摘要
        🔖 3. 條列式重點
        🔑 4. 關鍵時刻(段落摘要)
        💡 5. 結論反思(為什麼我們要學這個?)
        ❓ 6. 延伸小問題
    """
    messages = [
        {"role": "system", "content": sys_content},
        {"role": "user", "content": user_content}
    ]
    request_payload = {
        "model": "gpt-4-turbo-preview",
        "messages": messages,
        "max_tokens": 4000,
    }
    response = client.chat.completions.create(**request_payload)
    df_summarise = response.choices[0].message.content.strip()
    print("=====df_summarise=====")
    print(df_summarise)
    print("=====df_summarise=====")
    return df_summarise
def generate_questions(df_string):
    # Use OpenAI to generate questions based on the uploaded data
    sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,並用既有資料為本質猜測用戶可能會問的問題,使用 zh-TW"
    user_content = f"請根據 {df_string} 生成三個問題,並用 JSON 格式返回 questions:[q1的敘述text, q2的敘述text, q3的敘述text]"
    messages = [
        {"role": "system", "content": sys_content},
        {"role": "user", "content": user_content}
    ]
    response_format = {"type": "json_object"}

    print("=====messages=====")
    print(messages)
    print("=====messages=====")

    request_payload = {
        "model": "gpt-4-1106-preview",
        "messages": messages,
        "max_tokens": 4000,
        "response_format": response_format
    }
    response = client.chat.completions.create(**request_payload)
    questions = json.loads(response.choices[0].message.content)["questions"]
    print("=====json_response=====")
    print(questions)
    print("=====json_response=====")
    return questions
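
# With response_format={"type": "json_object"} the model is asked to return a
# JSON object; the parsing above assumes a shape like (hedged example):
#   {"questions": ["問題一?", "問題二?", "問題三?"]}
# so `questions` ends up as a plain list of three question strings.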
def get_questions(video_id, df_string):
    # Check Google Drive for an existing {video_id}_questions.json
    print("===get_questions===")
    service = init_drive_service()
    parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
    folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
    file_name = f'{video_id}_questions.json'

    # Check whether the file already exists
    exists, file_id = check_file_exists(service, folder_id, file_name)
    if not exists:
        questions = generate_questions(df_string)
        questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
        upload_content_directly(service, file_name, folder_id, questions_text)
        print("questions已上傳到Google Drive")
    else:
        # The questions already exist; download them
        print("questions已存在于Google Drive中")
        questions_text = download_file_as_string(service, file_id)
        questions = json.loads(questions_text)

    q1 = questions[0] if len(questions) > 0 else ""
    q2 = questions[1] if len(questions) > 1 else ""
    q3 = questions[2] if len(questions) > 2 else ""
    print("=====get_questions=====")
    print(f"q1: {q1}")
    print(f"q2: {q2}")
    print(f"q3: {q3}")
    print("=====get_questions=====")
    return q1, q2, q3
def change_questions(df_string):
    questions = generate_questions(df_string)
    q1 = questions[0] if len(questions) > 0 else ""
    q2 = questions[1] if len(questions) > 1 else ""
    q3 = questions[2] if len(questions) > 2 else ""
    print("=====change_questions=====")
    print(f"q1: {q1}")
    print(f"q2: {q2}")
    print(f"q3: {q3}")
    print("=====change_questions=====")
    return q1, q2, q3
def respond(user_message, df_string_output, chat_history, socratic_mode=False):
    print("=== 變數:user_message ===")
    print(user_message)
    print("=== 變數:chat_history ===")
    print(chat_history)

    # df_string_output arrives as the transcript JSON string (or the raw data
    # text for uploaded files); strip the bulky URL fields before handing the
    # data to the model, and fall back to the raw string if it is not JSON
    try:
        data = json.loads(df_string_output)
        for entry in data:
            entry.pop('embed_url', None)        # Remove 'embed_url' if it exists
            entry.pop('screenshot_path', None)  # Remove 'screenshot_path' if it exists
    except (json.JSONDecodeError, TypeError, AttributeError):
        data = df_string_output

    if socratic_mode:
        sys_content = f"""
            你是一個擅長資料分析跟影片教學的老師,user 為學生
            請用 {data} 為資料文本,自行判斷資料的種類,
            並進行對話,使用 zh-TW
            如果是影片類型,不用解釋逐字稿格式,直接回答學生問題
            請你用蘇格拉底式的提問方式,引導學生思考,並且給予學生一些提示
            不要直接給予答案,讓學生自己思考
            但可以給予一些提示跟引導,例如給予影片的時間軸,讓學生自己去找答案
            如果學生問了一些問題你無法判斷,請告訴學生你無法判斷,並建議學生可以問其他問題
            或者你可以問學生一些問題,幫助學生更好的理解資料
            如果學生的問題與資料文本無關,請告訴學生你無法回答超出範圍的問題
            最後,在你回答的開頭標註【蘇格拉底助教】
        """
    else:
        sys_content = f"""
            你是一個擅長資料分析跟影片教學的老師,user 為學生
            請用 {data} 為資料文本,自行判斷資料的種類,
            並進行對話,使用 zh-TW
            如果是影片類型,不用解釋逐字稿格式,直接回答學生問題
            但可以給予一些提示跟引導,例如給予影片的時間軸,讓學生可以找到相對應的時間點
            如果學生問了一些問題你無法判斷,請告訴學生你無法判斷,並建議學生可以問其他問題
            或者你可以問學生一些問題,幫助學生更好的理解資料
            如果學生的問題與資料文本無關,請告訴學生你無法回答超出範圍的問題
        """

    print("=== socratic_mode ===")
    print(socratic_mode)
    print("=== socratic_mode ===")
    print("=== sys_content ===")
    print(sys_content)
    print("=== sys_content ===")

    messages = [
        {"role": "system", "content": sys_content}
    ]

    # chat_history is a list of (user, assistant) pairs; replay it so the model
    # keeps the conversational context
    if chat_history is not None:
        # Keep only the last 10 exchanges
        if len(chat_history) > 10:
            chat_history = chat_history[-10:]
        for chat in chat_history:
            old_messages = [
                {"role": "user", "content": chat[0]},
                {"role": "assistant", "content": chat[1]}
            ]
            messages += old_messages

    messages.append({"role": "user", "content": user_message})
    print("=====messages=====")
    print(messages)
    print("=====messages=====")

    request_payload = {
        "model": "gpt-4-1106-preview",
        "messages": messages,
        "max_tokens": 4000  # Fairly large cap; adjust as needed
    }
    response = client.chat.completions.create(**request_payload)
    print(response)
    response_text = response.choices[0].message.content.strip()

    # Update the chat history
    new_chat_history = (user_message, response_text)
    if chat_history is None:
        chat_history = [new_chat_history]
    else:
        chat_history.append(new_chat_history)

    # Return the chat history plus an empty string to clear the input box
    return "", chat_history
def update_slide(direction):
    global TRANSCRIPTS
    global CURRENT_INDEX
    print("=== 更新投影片 ===")
    print(f"CURRENT_INDEX: {CURRENT_INDEX}")
    # print(f"TRANSCRIPTS: {TRANSCRIPTS}")

    CURRENT_INDEX += direction
    if CURRENT_INDEX < 0:
        CURRENT_INDEX = 0  # Keep the index from going below zero
    elif CURRENT_INDEX >= len(TRANSCRIPTS):
        CURRENT_INDEX = len(TRANSCRIPTS) - 1  # Keep the index within range

    # Get the text and screenshot URL of the current entry
    current_transcript = TRANSCRIPTS[CURRENT_INDEX]
    slide_image = current_transcript["screenshot_path"]
    slide_text = current_transcript["text"]
    return slide_image, slide_text

# Wrapper for the "Previous" button click event
def prev_slide():
    return update_slide(-1)

# Wrapper for the "Next" button click event
def next_slide():
    return update_slide(1)

def get_video_id():
    return VIDEO_ID
HEAD = """ | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<style> | |
svg.markmap {{ | |
width: 100%; | |
height: 100vh; | |
}} | |
</style> | |
<script src="https://cdn.jsdelivr.net/npm/[email protected]"></script> | |
<script> | |
const mind_map_tab_button = document.querySelector("#mind_map_tab-button"); | |
if (mind_map_tab_button) { | |
mind_map_tab_button.addEventListener('click', function() { | |
const mind_map_markdown = document.querySelector("#mind_map_markdown > label > textarea"); | |
if (mind_map_markdown) { | |
// 当按钮被点击时,打印当前的textarea的值 | |
console.log('Value changed to: ' + mind_map_markdown.value); | |
markmap.autoLoader.renderAll(); | |
} | |
}); | |
} | |
</script> | |
""" | |
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=2):
            file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
            youtube_link = gr.Textbox(label="Enter YouTube Link")
            youtube_link_btn = gr.Button("Submit_YouTube_Link")
            web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
            chatbot = gr.Chatbot()
            socratic_mode_btn = gr.Checkbox(label="蘇格拉底家教助理模式", value=False)
            msg = gr.Textbox(label="Message")
            send_button = gr.Button("Send")
        with gr.Column(scale=3):
            with gr.Tab("圖文"):
                transcript_html = gr.HTML(label="YouTube Transcript and Video")
            with gr.Tab("投影片"):
                slide_image = gr.Image()
                slide_text = gr.Textbox()
                with gr.Row():
                    prev_button = gr.Button("Previous")
                    next_button = gr.Button("Next")
                prev_button.click(fn=prev_slide, inputs=[], outputs=[slide_image, slide_text])
                next_button.click(fn=next_slide, inputs=[], outputs=[slide_image, slide_text])
            with gr.Tab("逐字稿"):
                simple_html_content = gr.HTML(label="Simple Transcript")
            with gr.Tab("本文"):
                df_string_output = gr.Textbox(lines=40, label="Data Text")
            with gr.Tab("重點"):
                df_summarise = gr.Textbox(container=True, show_copy_button=True, lines=40)
            with gr.Tab("問題"):
                gr.Markdown("## 常用問題")
                btn_1 = gr.Button()
                btn_2 = gr.Button()
                btn_3 = gr.Button()
                gr.Markdown("## 重新生成問題")
                btn_create_question = gr.Button("Create Questions")
            with gr.Tab("markdown"):
                gr.Markdown("## 請複製以下 markdown 並貼到你的心智圖工具中,建議使用:https://markmap.js.org/repl")
                mind_map = gr.Textbox(container=True, show_copy_button=True, lines=40, elem_id="mind_map_markdown")
            with gr.Tab("心智圖", elem_id="mind_map_tab"):
                mind_map_html = gr.HTML()

    send_button.click(
        respond,
        inputs=[msg, df_string_output, chatbot, socratic_mode_btn],
        outputs=[msg, chatbot]
    )

    # Wire the suggested-question buttons to the chat handler
    btn_1.click(respond, inputs=[btn_1, df_string_output, chatbot, socratic_mode_btn], outputs=[msg, chatbot])
    btn_2.click(respond, inputs=[btn_2, df_string_output, chatbot, socratic_mode_btn], outputs=[msg, chatbot])
    btn_3.click(respond, inputs=[btn_3, df_string_output, chatbot, socratic_mode_btn], outputs=[msg, chatbot])
    btn_create_question.click(change_questions, inputs=[df_string_output], outputs=[btn_1, btn_2, btn_3])

    # file_upload.change(process_file, inputs=file_upload, outputs=df_string_output)
    file_upload.change(process_file, inputs=file_upload, outputs=[btn_1, btn_2, btn_3, df_summarise, df_string_output])

    # Triggered when a YouTube link is entered
    youtube_link.change(
        process_youtube_link,
        inputs=youtube_link,
        outputs=[
            btn_1,
            btn_2,
            btn_3,
            df_string_output,
            df_summarise,
            mind_map,
            mind_map_html,
            transcript_html,
            simple_html_content,
            slide_image,
            slide_text
        ]
    )
    youtube_link_btn.click(
        process_youtube_link,
        inputs=youtube_link,
        outputs=[
            btn_1,
            btn_2,
            btn_3,
            df_string_output,
            df_summarise,
            mind_map,
            mind_map_html,
            transcript_html,
            simple_html_content,
            slide_image,
            slide_text
        ]
    )

    # Triggered when a web page link is entered
    # web_link.change(process_web_link, inputs=web_link, outputs=[btn_1, btn_2, btn_3, df_summarise, df_string_output])

demo.launch(allowed_paths=["videos"])