Spaces:
Sleeping
Sleeping
check_file_exists
Browse files
app.py
CHANGED
@@ -19,23 +19,23 @@ from urllib.parse import urlparse, parse_qs
|
|
19 |
|
20 |
|
21 |
# 假设您的环境变量或Secret的名称是GOOGLE_APPLICATION_CREDENTIALS_JSON
|
22 |
-
credentials_json_string = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
23 |
-
credentials_dict = json.loads(credentials_json_string)
|
24 |
-
SCOPES = ['https://www.googleapis.com/auth/drive']
|
25 |
-
credentials = service_account.Credentials.from_service_account_info(
|
26 |
-
|
27 |
-
service = build('drive', 'v3', credentials=credentials)
|
28 |
-
# 列出 Google Drive 上的前10個文件
|
29 |
-
results = service.files().list(pageSize=10, fields="nextPageToken, files(id, name)").execute()
|
30 |
-
items = results.get('files', [])
|
31 |
-
|
32 |
-
if not items:
|
33 |
-
|
34 |
-
else:
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
|
40 |
|
41 |
|
@@ -45,6 +45,35 @@ OUTPUT_PATH = 'videos'
|
|
45 |
OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
|
46 |
client = OpenAI(api_key=OPEN_AI_KEY)
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
def process_file(file):
|
49 |
# 读取文件
|
50 |
if file.name.endswith('.csv'):
|
@@ -106,9 +135,22 @@ def process_youtube_link(link):
|
|
106 |
# 使用 YouTube API 获取逐字稿
|
107 |
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
108 |
video_id = extract_youtube_id(link)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
# 先下載 video
|
111 |
-
download_youtube_video(video_id, output_path=OUTPUT_PATH)
|
112 |
# 再取得 transcript
|
113 |
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
|
114 |
# 基于逐字稿生成其他所需的输出
|
@@ -175,6 +217,8 @@ def download_youtube_video(youtube_id, output_path=OUTPUT_PATH):
|
|
175 |
|
176 |
|
177 |
def screenshot_youtube_video(youtube_id, snapshot_sec):
|
|
|
|
|
178 |
# 这里假设视频已经在适当的位置
|
179 |
video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'
|
180 |
|
@@ -185,24 +229,6 @@ def screenshot_youtube_video(youtube_id, snapshot_sec):
|
|
185 |
|
186 |
return screenshot_path
|
187 |
|
188 |
-
def process_video(youtube_id):
|
189 |
-
download_youtube_video(youtube_id)
|
190 |
-
video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'
|
191 |
-
video = VideoFileClip(video_path)
|
192 |
-
duration = int(video.duration)
|
193 |
-
output_path = f'{OUTPUT_PATH}/screenshots/{youtube_id}'
|
194 |
-
os.makedirs(output_path, exist_ok=True)
|
195 |
-
|
196 |
-
# fake duration
|
197 |
-
duration = 10
|
198 |
-
|
199 |
-
screenshot_paths = []
|
200 |
-
for i in range(1, duration):
|
201 |
-
screenshot_path = screenshot_youtube_video(youtube_id, i)
|
202 |
-
screenshot_paths.append(screenshot_path)
|
203 |
-
|
204 |
-
return screenshot_paths
|
205 |
-
|
206 |
def get_screenshot_from_video(video_link, start_time):
|
207 |
# 实现从视频中提取帧的逻辑
|
208 |
# 由于这需要服务器端处理,你可能需要一种方法来下载视频,
|
|
|
19 |
|
20 |
|
21 |
# 假设您的环境变量或Secret的名称是GOOGLE_APPLICATION_CREDENTIALS_JSON
|
22 |
+
# credentials_json_string = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
23 |
+
# credentials_dict = json.loads(credentials_json_string)
|
24 |
+
# SCOPES = ['https://www.googleapis.com/auth/drive']
|
25 |
+
# credentials = service_account.Credentials.from_service_account_info(
|
26 |
+
# credentials_dict, scopes=SCOPES)
|
27 |
+
# service = build('drive', 'v3', credentials=credentials)
|
28 |
+
# # 列出 Google Drive 上的前10個文件
|
29 |
+
# results = service.files().list(pageSize=10, fields="nextPageToken, files(id, name)").execute()
|
30 |
+
# items = results.get('files', [])
|
31 |
+
|
32 |
+
# if not items:
|
33 |
+
# print('No files found.')
|
34 |
+
# else:
|
35 |
+
# print("=====Google Drive 上的前10個文件=====")
|
36 |
+
# print('Files:')
|
37 |
+
# for item in items:
|
38 |
+
# print(u'{0} ({1})'.format(item['name'], item['id']))
|
39 |
|
40 |
|
41 |
|
|
|
45 |
OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
|
46 |
client = OpenAI(api_key=OPEN_AI_KEY)
|
47 |
|
48 |
+
# 初始化Google Drive服务
|
49 |
+
def init_drive_service():
    """Build a Google Drive v3 client from a service-account secret.

    The service-account key is read as a JSON document from the
    ``GOOGLE_APPLICATION_CREDENTIALS_JSON`` environment variable, and the
    credentials are created with the ``https://www.googleapis.com/auth/drive``
    scope.

    Returns:
        The service object returned by ``build('drive', 'v3', ...)``.

    Raises:
        RuntimeError: If the environment variable is unset or empty.
        json.JSONDecodeError: If the variable does not contain valid JSON.
    """
    credentials_json_string = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
    if not credentials_json_string:
        # Fail early with an actionable message; otherwise json.loads(None)
        # raises a TypeError that does not mention the missing secret.
        raise RuntimeError(
            "Environment variable GOOGLE_APPLICATION_CREDENTIALS_JSON is not "
            "set; it must contain the service-account key as JSON."
        )

    credentials_dict = json.loads(credentials_json_string)
    scopes = ['https://www.googleapis.com/auth/drive']
    credentials = service_account.Credentials.from_service_account_info(
        credentials_dict, scopes=scopes)
    return build('drive', 'v3', credentials=credentials)
|
57 |
+
|
58 |
+
# 检查Google Drive上是否存在文件
|
59 |
+
def check_file_exists(service, folder_name, file_name):
    """Check whether a non-trashed file with the given name is in a folder.

    Args:
        service: Drive service object; must provide ``files().list(q=...)``
            whose ``execute()`` returns a mapping with an optional
            ``'files'`` list.
        folder_name: Value placed in the ``'<value>' in parents`` clause of
            the query.
            NOTE(review): the Drive ``in parents`` operator matches folder
            IDs, so callers presumably need to pass a folder ID here rather
            than a display name or path — confirm against the callers.
        file_name: Exact file name to look for.

    Returns:
        A ``(exists, file_id)`` tuple: ``(True, <id of the first match>)``
        when at least one file matches, otherwise ``(False, None)``.
    """
    def _quote(value):
        # Drive query strings are single-quoted; backslashes and single
        # quotes inside a value must be backslash-escaped or the query
        # becomes malformed (e.g. for a file called "it's.txt").
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    query = (
        f"name = '{_quote(file_name)}' "
        f"and '{_quote(folder_name)}' in parents and trashed = false"
    )
    response = service.files().list(q=query).execute()
    files = response.get('files', [])
    if not files:
        return False, None
    return True, files[0]['id']
|
64 |
+
|
65 |
+
# 上传文件到Google Drive
|
66 |
+
def upload_to_drive(service, file_name, folder_id, content):
    """Create a plain-text file in a Drive folder and return its ID.

    Args:
        service: Drive service object providing ``files().create(...)``.
        file_name: Name given to the new file.
        folder_id: Value used as the single entry of the file's ``parents``.
        content: Text to upload; it is encoded with ``str.encode()`` defaults
            and sent with the ``text/plain`` MIME type.

    Returns:
        The ``'id'`` field of the create response (``None`` if absent).
    """
    payload = io.BytesIO(content.encode())
    media = MediaIoBaseUpload(payload, mimetype='text/plain')
    request = service.files().create(
        body={'name': file_name, 'parents': [folder_id]},
        media_body=media,
        fields='id',  # only the new file's ID is needed from the response
    )
    created = request.execute()
    return created.get('id')
|
74 |
+
|
75 |
+
|
76 |
+
|
77 |
def process_file(file):
|
78 |
# 读取文件
|
79 |
if file.name.endswith('.csv'):
|
|
|
135 |
# 使用 YouTube API 获取逐字稿
|
136 |
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
137 |
video_id = extract_youtube_id(link)
|
138 |
+
service = init_drive_service()
|
139 |
+
folder_name = 'youtube逐字稿圖檔/{video_id}' # Google Drive上的文件夹ID
|
140 |
+
file_name = f"{video_id}_transcript.txt"
|
141 |
+
|
142 |
+
# 检查逐字稿是否存在
|
143 |
+
exists, file_id = check_file_exists(service, folder_name, file_name)
|
144 |
+
if not exists:
|
145 |
+
# 获取逐字稿
|
146 |
+
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
|
147 |
+
transcript_text = "\n".join([f"{item['start']}: {item['text']}" for item in transcript])
|
148 |
+
# 上传到Google Drive
|
149 |
+
upload_to_drive(service, file_name, folder_name, transcript_text)
|
150 |
+
print("逐字稿已上传到Google Drive")
|
151 |
+
else:
|
152 |
+
print("逐字稿已存在于Google Drive中")
|
153 |
|
|
|
|
|
154 |
# 再取得 transcript
|
155 |
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
|
156 |
# 基于逐字稿生成其他所需的输出
|
|
|
217 |
|
218 |
|
219 |
def screenshot_youtube_video(youtube_id, snapshot_sec):
|
220 |
+
# 先下載 video
|
221 |
+
download_youtube_video(youtube_id, output_path=OUTPUT_PATH)
|
222 |
# 这里假设视频已经在适当的位置
|
223 |
video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'
|
224 |
|
|
|
229 |
|
230 |
return screenshot_path
|
231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
def get_screenshot_from_video(video_link, start_time):
|
233 |
# 实现从视频中提取帧的逻辑
|
234 |
# 由于这需要服务器端处理,你可能需要一种方法来下载视频,
|