Upload main.py
Browse files
main.py
CHANGED
@@ -132,35 +132,6 @@ class CivitAICrawler:
|
|
132 |
logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
|
133 |
return {}
|
134 |
|
135 |
-
def download_model_files(self, model_versions: list, folder: str):
    """Download the files of every given model version into ``folder``.

    Each file is fetched with ``self.download_file``. If the download
    reports a path and a stray entry named ``login`` shows up in
    ``folder``, that entry is deleted and the download is retried, up to
    five times per file. When all five tries end with a ``login`` entry,
    a ``<file_name>.download_failed`` marker file is written instead.

    Args:
        model_versions: Version dicts; each may carry a ``"files"`` list
            whose items provide ``"downloadUrl"`` and ``"name"``.
        folder: Destination directory for the downloaded files.
    """
    for version in model_versions:
        for file_info in version.get("files", []):
            download_url = file_info["downloadUrl"]
            file_name = file_info["name"]

            # The loop variable doubles as the count of 'login' detections.
            for login_detected_count in range(1, 6):
                saved_path = self.download_file(download_url, folder, file_name)
                if not saved_path or "login" not in os.listdir(folder):
                    # Either the download reported nothing, or no stray
                    # 'login' entry appeared: stop retrying this file.
                    break
                # A mysterious "login" file appeared in the download
                # folder: remove it and try again.
                os.remove(os.path.join(folder, "login"))
                logger.warning(f"Detected 'login' file, retrying download: {file_name} ({login_detected_count}/5)")
            else:
                # Every attempt produced a 'login' entry: leave a dummy file
                # that marks the download as failed.
                dummy_file_path = os.path.join(folder, f"{file_name}.download_failed")
                try:
                    with open(dummy_file_path, "w") as f:
                        f.write("Download failed after 5 attempts.")
                    logger.error(f"Failed to download {file_name}. Created dummy file: {dummy_file_path}")
                except Exception as e:
                    logger.error(f"Failed to create dummy file for {file_name}: {e}")
|
163 |
-
|
164 |
def download_images(self, model_versions: list, folder: str):
|
165 |
"""画像を images フォルダにまとめてダウンロードする."""
|
166 |
images_folder = os.path.join(folder, "images")
|
@@ -203,20 +174,93 @@ class CivitAICrawler:
|
|
203 |
logger.info(f"Saved model_info.json: {info_path}")
|
204 |
except Exception as e:
|
205 |
logger.error(f"Failed to save model info JSON: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
|
212 |
-
"""
|
213 |
-
1. /home/user/app/encrypted 配下の古いファイルやフォルダを消去(オプション)
|
214 |
-
2. rclone copy local_folder => cryptLocal:subfolder_label
|
215 |
-
→ /home/user/app/encrypted/subfolder_label/ が必ずフォルダとして作られる
|
216 |
-
3. そのフォルダを upload_folder()
|
217 |
-
4. ローカル平文フォルダ & 暗号フォルダを削除
|
218 |
-
5. 最後に subfolder_label を return
|
219 |
-
"""
|
220 |
if not os.path.exists(local_folder):
|
221 |
logger.error(f"encrypt_and_upload_folder: folder not found: {local_folder}")
|
222 |
return None
|
@@ -224,8 +268,7 @@ class CivitAICrawler:
|
|
224 |
encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
|
225 |
os.makedirs(encrypted_base_dir, exist_ok=True)
|
226 |
|
227 |
-
#
|
228 |
-
# これにより "/home/user/app/encrypted" 配下を毎回スッキリさせる
|
229 |
for item in os.listdir(encrypted_base_dir):
|
230 |
item_path = os.path.join(encrypted_base_dir, item)
|
231 |
try:
|
@@ -236,12 +279,8 @@ class CivitAICrawler:
|
|
236 |
logger.info(f"[CLEANUP] Removed old encrypted item: {item_path}")
|
237 |
except Exception as e:
|
238 |
logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")
|
239 |
-
# --------------------------------------
|
240 |
|
241 |
-
# 今回は "enc_{UUID}" のようなサブフォルダ名を決める
|
242 |
subfolder_label = "enc_" + str(uuid.uuid4())[:8]
|
243 |
-
|
244 |
-
# rclone で確実に "enc_XXXX" フォルダが生成される
|
245 |
try:
|
246 |
subprocess.run(
|
247 |
["rclone", "copy", local_folder, f"cryptLocal:{subfolder_label}"],
|
@@ -253,17 +292,17 @@ class CivitAICrawler:
|
|
253 |
|
254 |
enc_folder_path = os.path.join(encrypted_base_dir, subfolder_label)
|
255 |
if not os.path.isdir(enc_folder_path):
|
256 |
-
logger.error(f"[ERROR] {enc_folder_path} is not a directory.
|
257 |
return None
|
258 |
|
259 |
-
#
|
260 |
try:
|
261 |
self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
|
262 |
logger.info(f"Uploaded encrypted folder: {enc_folder_path}")
|
263 |
except Exception as e:
|
264 |
logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
|
265 |
|
266 |
-
#
|
267 |
try:
|
268 |
shutil.rmtree(local_folder)
|
269 |
shutil.rmtree(enc_folder_path)
|
@@ -310,6 +349,43 @@ class CivitAICrawler:
|
|
310 |
logger.error(f"Failed after {max_retries} attempts: {e}")
|
311 |
raise
|
312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
313 |
def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
|
314 |
"""
|
315 |
フォルダを Hugging Face リポジトリに一括アップロード
|
@@ -403,36 +479,30 @@ class CivitAICrawler:
|
|
403 |
logger.warning(f"No modelVersions found for ID {model_id}.")
|
404 |
return
|
405 |
|
406 |
-
#
|
407 |
-
# たとえばモデル名をベースにフォルダを作る(被り防止にUUIDを付与)
|
408 |
folder_name = model_info.get("name", "UnknownModel")
|
409 |
-
folder_name = re.sub(r'[\\/*?:"<>|]', '_', folder_name)
|
410 |
folder_name += "_" + str(uuid.uuid4())[:8]
|
411 |
os.makedirs(folder_name, exist_ok=True)
|
412 |
|
413 |
-
#
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
self.download_images(versions, folder_name)
|
418 |
-
|
419 |
-
# HTMLを取得
|
420 |
-
model_page_url = f"{self.config.URLS['modelPage']}{model_id}"
|
421 |
-
self.save_html_content(model_page_url, folder_name)
|
422 |
-
|
423 |
-
# model_info.json保存
|
424 |
self.save_model_info_json(model_info, folder_name)
|
425 |
|
426 |
-
#
|
|
|
|
|
|
|
427 |
enc_subfolder = self.encrypt_and_upload_folder(folder_name)
|
428 |
if enc_subfolder is None:
|
429 |
enc_subfolder = "[ENCRYPT_FAILED]"
|
430 |
|
431 |
hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
|
432 |
-
|
433 |
-
# model_list.logに追記 (実際の暗号フォルダ名をそのまま書き込む)
|
434 |
with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
|
435 |
-
# 例: "Pudu chileno (ID:12345): https://huggingface.co/xxx/tree/main/<暗号フォルダ>"
|
436 |
f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n")
|
437 |
|
438 |
except Exception as e:
|
|
|
132 |
logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
|
133 |
return {}
|
134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
def download_images(self, model_versions: list, folder: str):
|
136 |
"""画像を images フォルダにまとめてダウンロードする."""
|
137 |
images_folder = os.path.join(folder, "images")
|
|
|
174 |
logger.info(f"Saved model_info.json: {info_path}")
|
175 |
except Exception as e:
|
176 |
logger.error(f"Failed to save model info JSON: {e}")
|
177 |
+
# =========================================================================
|
178 |
+
# ここが重要:
|
179 |
+
# - 最新バージョンはまとめて folder_name にダウンロード (一度に暗号化アップロード)
|
180 |
+
# - 古いバージョンは1つずつダウンロード→暗号化→アップロード→削除 でストレージを節約
|
181 |
+
# =========================================================================
|
182 |
+
def download_and_process_versions(self, model_versions: list, folder: str):
    """Download the latest version in bulk; upload older versions one file at a time.

    The first entry of ``model_versions`` is treated as the latest version
    and its files are downloaded into ``folder`` (the caller encrypts and
    uploads that folder later). Every file of the remaining versions is
    downloaded into ``folder/old_versions``, moved into a temporary
    one-file folder, passed to ``self.encrypt_and_upload_folder`` and then
    removed locally, so that only one old-version file occupies disk at a
    time.

    Args:
        model_versions: Version dicts; each may carry a ``"files"`` list
            whose items provide ``"downloadUrl"`` and ``"name"``.
        folder: Working directory for the model being processed.
    """
    if not model_versions:
        # Guard against IndexError on model_versions[0].
        logger.warning("download_and_process_versions: no model versions to process.")
        return

    # 1) Latest version (index 0): download its files straight into `folder`.
    latest_version = model_versions[0]
    logger.info(f"Processing latest version: {latest_version.get('name','(NoName)')}")

    for file_info in latest_version.get("files", []):
        download_url = file_info["downloadUrl"]
        file_name = file_info["name"]
        _, exhausted = self._download_with_login_retry(download_url, folder, file_name)
        if exhausted:
            dummy_file_path = self._write_download_failed_marker(folder, file_name)
            if dummy_file_path:
                logger.error(f"Failed to download {file_name}. Dummy file created: {dummy_file_path}")

    # 2) Older versions: download -> encrypt/upload -> delete, one file at a time.
    older_versions = model_versions[1:]
    if not older_versions:
        return

    old_versions_folder = os.path.join(folder, "old_versions")
    os.makedirs(old_versions_folder, exist_ok=True)

    for version in older_versions:
        logger.info(f"Processing older version: {version.get('name','(NoName)')}")
        for file_info in version.get("files", []):
            file_name = file_info["name"]
            download_url = file_info["downloadUrl"]
            local_path, exhausted = self._download_with_login_retry(
                download_url, old_versions_folder, file_name
            )

            if exhausted:
                dummy_file_path = self._write_download_failed_marker(old_versions_folder, file_name)
                if dummy_file_path:
                    logger.error(f"Failed to download {file_name}. Dummy file: {dummy_file_path}")
                continue  # move on to the next file

            if not local_path:
                # download_file reported no path: there is nothing to move or
                # upload, so say so instead of failing inside shutil.move.
                logger.error(f"Download failed for old version file {file_name}; skipping encrypt/upload.")
                continue

            self._encrypt_and_upload_single_file(local_path, old_versions_folder, file_name)


def _download_with_login_retry(self, download_url: str, target_folder: str, file_name: str,
                               max_attempts: int = 5) -> tuple:
    """Download one file, retrying while a stray ``login`` entry keeps appearing.

    Returns:
        ``(local_path, exhausted)``: ``local_path`` is whatever the last
        ``self.download_file`` call returned; ``exhausted`` is True when all
        ``max_attempts`` tries ended with a ``login`` entry in
        ``target_folder``.
    """
    local_path = None
    for login_detected_count in range(1, max_attempts + 1):
        local_path = self.download_file(download_url, target_folder, file_name)
        if not local_path or "login" not in os.listdir(target_folder):
            return local_path, False
        # Remove the stray entry so the next attempt starts clean.
        os.remove(os.path.join(target_folder, "login"))
        logger.warning(f"'login' file found, retrying {file_name} ({login_detected_count}/{max_attempts})")
    return local_path, True


def _write_download_failed_marker(self, target_folder: str, file_name: str,
                                  max_attempts: int = 5) -> Optional[str]:
    """Create ``<file_name>.download_failed`` in ``target_folder``.

    Returns the marker path, or None when the marker could not be written
    (the failure is logged, not raised).
    """
    dummy_file_path = os.path.join(target_folder, f"{file_name}.download_failed")
    try:
        with open(dummy_file_path, "w") as f:
            f.write(f"Download failed after {max_attempts} attempts.")
    except Exception as e:
        # Best effort: a missing marker must not abort the remaining downloads.
        logger.error(f"Failed to create dummy file for {file_name}: {e}")
        return None
    return dummy_file_path


def _encrypt_and_upload_single_file(self, local_path: str, old_versions_folder: str, file_name: str):
    """Encrypt and upload one old-version file via a temporary one-file folder."""
    single_file_folder = os.path.join(old_versions_folder, "temp_single")
    os.makedirs(single_file_folder, exist_ok=True)
    try:
        # Move the file so the folder handed to the uploader contains only it.
        shutil.move(local_path, os.path.join(single_file_folder, file_name))
        enc_subfolder = self.encrypt_and_upload_folder(single_file_folder)
        if enc_subfolder is None:
            logger.error(f"Encrypt/upload returned no folder for old version file: {file_name}")
        else:
            # Record the generated folder name; it is the only link between
            # this file and its uploaded location.
            logger.info(f"Old version file {file_name} uploaded as encrypted folder: {enc_subfolder}")
    except Exception as e:
        logger.error(f"Failed to encrypt/upload old version file: {e}")
    finally:
        # encrypt_and_upload_folder is expected to delete the folder itself;
        # clean up here in case it did not get that far.
        if os.path.exists(single_file_folder):
            shutil.rmtree(single_file_folder)
            logger.info(f"Removed temp_single folder {single_file_folder}")
|
261 |
+
|
262 |
+
# ========== 既存の encrypt_and_upload_folder はそのまま活用 ==========
|
263 |
def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
if not os.path.exists(local_folder):
|
265 |
logger.error(f"encrypt_and_upload_folder: folder not found: {local_folder}")
|
266 |
return None
|
|
|
268 |
encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
|
269 |
os.makedirs(encrypted_base_dir, exist_ok=True)
|
270 |
|
271 |
+
# 前のコードにある「古い暗号ファイル削除」はそのまま残す
|
|
|
272 |
for item in os.listdir(encrypted_base_dir):
|
273 |
item_path = os.path.join(encrypted_base_dir, item)
|
274 |
try:
|
|
|
279 |
logger.info(f"[CLEANUP] Removed old encrypted item: {item_path}")
|
280 |
except Exception as e:
|
281 |
logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")
|
|
|
282 |
|
|
|
283 |
subfolder_label = "enc_" + str(uuid.uuid4())[:8]
|
|
|
|
|
284 |
try:
|
285 |
subprocess.run(
|
286 |
["rclone", "copy", local_folder, f"cryptLocal:{subfolder_label}"],
|
|
|
292 |
|
293 |
enc_folder_path = os.path.join(encrypted_base_dir, subfolder_label)
|
294 |
if not os.path.isdir(enc_folder_path):
|
295 |
+
logger.error(f"[ERROR] {enc_folder_path} is not a directory.")
|
296 |
return None
|
297 |
|
298 |
+
# アップロード
|
299 |
try:
|
300 |
self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
|
301 |
logger.info(f"Uploaded encrypted folder: {enc_folder_path}")
|
302 |
except Exception as e:
|
303 |
logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
|
304 |
|
305 |
+
# ローカル削除
|
306 |
try:
|
307 |
shutil.rmtree(local_folder)
|
308 |
shutil.rmtree(enc_folder_path)
|
|
|
349 |
logger.error(f"Failed after {max_retries} attempts: {e}")
|
350 |
raise
|
351 |
|
352 |
+
def upload_file(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
    """Upload a single file through ``self.api.upload_file`` with retries.

    Args:
        file_path: Local path of the file to upload.
        repo_id: Target repository; defaults to ``self.repo_ids['current']``.
        path_in_repo: Destination path inside the repository; defaults to
            the base name of ``file_path``.

    Error handling, keyed on the exception message:
        * "over the limit of 100000 files": switch to the next repo name
          (``self.increment_repo_name``), create it as a private repo and
          start over against that new repo.
        * "you can retry this action in about 1 hour": sleep one hour and
          retry without consuming an attempt.
        * anything else: retry up to five times, then re-raise the last
          exception.

    Raises:
        Exception: The last upload error once the retries are used up, or
            any error raised while creating the replacement repo.
    """
    if repo_id is None:
        repo_id = self.repo_ids['current']
    if path_in_repo is None:
        path_in_repo = os.path.basename(file_path)

    max_retries = 5
    attempt = 0
    while attempt < max_retries:
        try:
            self.api.upload_file(
                path_or_fileobj=file_path,
                repo_id=repo_id,
                path_in_repo=path_in_repo
            )
            logger.info(f"Uploaded file: {file_path} to {repo_id} at {path_in_repo}")
            return
        except Exception as e:
            attempt += 1
            error_message = str(e)
            if "over the limit of 100000 files" in error_message:
                logger.warning("File limit exceeded, creating a new repo.")
                self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                # Retarget the retry at the new repo. Without this the loop
                # keeps hitting the full repo and creating repos forever.
                repo_id = self.repo_ids['current']
                attempt = 0
                continue
            elif "you can retry this action in about 1 hour" in error_message:
                logger.warning("Rate limit hit. Waiting 1 hour...")
                time.sleep(3600)
                # A rate-limit wait does not count as a failed attempt.
                attempt -= 1
            else:
                if attempt < max_retries:
                    logger.warning(f"Failed to upload {file_path}, retry {attempt}/{max_retries}")
                else:
                    logger.error(f"Failed after {max_retries} attempts: {e}")
                    raise
|
388 |
+
|
389 |
def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
|
390 |
"""
|
391 |
フォルダを Hugging Face リポジトリに一括アップロード
|
|
|
479 |
logger.warning(f"No modelVersions found for ID {model_id}.")
|
480 |
return
|
481 |
|
482 |
+
# ベースとなるフォルダ名
|
|
|
483 |
folder_name = model_info.get("name", "UnknownModel")
|
484 |
+
folder_name = re.sub(r'[\\/*?:"<>|]', '_', folder_name)
|
485 |
folder_name += "_" + str(uuid.uuid4())[:8]
|
486 |
os.makedirs(folder_name, exist_ok=True)
|
487 |
|
488 |
+
# (A) 最新バージョンファイル + (B) 画像 + (C) HTML + model_info.json
|
489 |
+
# → 全部 folder_name に置く
|
490 |
+
# → 後でまとめて encrypt_and_upload_folder
|
491 |
+
self.download_and_process_versions(versions, folder_name)
|
492 |
self.download_images(versions, folder_name)
|
493 |
+
self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name)
|
|
|
|
|
|
|
|
|
|
|
494 |
self.save_model_info_json(model_info, folder_name)
|
495 |
|
496 |
+
# ↑ この時点で「最新バージョン」の大ファイルが folder_name に残っている
|
497 |
+
# しかし old_versions は既に1つずつ暗号化→削除済み
|
498 |
+
|
499 |
+
# まとめて暗号化 & アップロード
|
500 |
enc_subfolder = self.encrypt_and_upload_folder(folder_name)
|
501 |
if enc_subfolder is None:
|
502 |
enc_subfolder = "[ENCRYPT_FAILED]"
|
503 |
|
504 |
hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
|
|
|
|
|
505 |
with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
|
|
|
506 |
f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n")
|
507 |
|
508 |
except Exception as e:
|