ttttdiva committed on
Commit
9b752dd
·
verified ·
1 Parent(s): aa3f601

Upload main.py

Browse files
Files changed (1) hide show
  1. main.py +137 -67
main.py CHANGED
@@ -132,35 +132,6 @@ class CivitAICrawler:
132
  logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
133
  return {}
134
 
135
def download_model_files(self, model_versions: list, folder: str):
    """Download the files of every given model version into ``folder``.

    Each file is fetched through ``self.download_file``. If a stray file
    named ``login`` shows up in ``folder`` after a download, it is deleted
    and the download is retried, up to 5 times. When all 5 retries are
    used up, a ``<file_name>.download_failed`` marker file is written
    instead.

    Args:
        model_versions: Version dicts; each may carry a ``"files"`` list
            whose entries provide ``"downloadUrl"`` and ``"name"``.
        folder: Destination directory for the downloaded files.
    """
    for version in model_versions:
        for file_info in version.get("files", []):
            download_url = file_info["downloadUrl"]
            file_name = file_info["name"]
            login_detected_count = 0
            retries_exhausted = False

            while not retries_exhausted:
                saved_path = self.download_file(download_url, folder, file_name)
                # Stop as soon as the download did not leave a "login" artifact.
                if not saved_path or "login" not in os.listdir(folder):
                    break
                # A mysterious "login" file appeared in the target folder:
                # drop it and try the same download again.
                login_detected_count += 1
                os.remove(os.path.join(folder, "login"))
                logger.warning(f"Detected 'login' file, retrying download: {file_name} ({login_detected_count}/5)")
                retries_exhausted = login_detected_count >= 5

            if not retries_exhausted:
                continue

            # Leave a dummy file behind that records the failed download.
            dummy_file_path = os.path.join(folder, f"{file_name}.download_failed")
            try:
                with open(dummy_file_path, "w") as marker:
                    marker.write("Download failed after 5 attempts.")
                logger.error(f"Failed to download {file_name}. Created dummy file: {dummy_file_path}")
            except Exception as e:
                logger.error(f"Failed to create dummy file for {file_name}: {e}")
163
-
164
  def download_images(self, model_versions: list, folder: str):
165
  """画像を images フォルダにまとめてダウンロードする."""
166
  images_folder = os.path.join(folder, "images")
@@ -203,20 +174,93 @@ class CivitAICrawler:
203
  logger.info(f"Saved model_info.json: {info_path}")
204
  except Exception as e:
205
  logger.error(f"Failed to save model info JSON: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
-
208
- # =====================================================
209
- # 必ずサブフォルダを作り、古いファイルやフォルダは削除してからコピー
210
- # =====================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
212
- """
213
- 1. /home/user/app/encrypted 配下の古いファイルやフォルダを消去(オプション)
214
- 2. rclone copy local_folder => cryptLocal:subfolder_label
215
- → /home/user/app/encrypted/subfolder_label/ が必ずフォルダとして作られる
216
- 3. そのフォルダを upload_folder()
217
- 4. ローカル平文フォルダ & 暗号フォルダを削除
218
- 5. 最後に subfolder_label を return
219
- """
220
  if not os.path.exists(local_folder):
221
  logger.error(f"encrypt_and_upload_folder: folder not found: {local_folder}")
222
  return None
@@ -224,8 +268,7 @@ class CivitAICrawler:
224
  encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
225
  os.makedirs(encrypted_base_dir, exist_ok=True)
226
 
227
- # --- 古い暗号ファイルやフォルダを消す例 ---
228
- # これにより "/home/user/app/encrypted" 配下を毎回スッキリさせる
229
  for item in os.listdir(encrypted_base_dir):
230
  item_path = os.path.join(encrypted_base_dir, item)
231
  try:
@@ -236,12 +279,8 @@ class CivitAICrawler:
236
  logger.info(f"[CLEANUP] Removed old encrypted item: {item_path}")
237
  except Exception as e:
238
  logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")
239
- # --------------------------------------
240
 
241
- # 今回は "enc_{UUID}" のようなサブフォルダ名を決める
242
  subfolder_label = "enc_" + str(uuid.uuid4())[:8]
243
-
244
- # rclone で確実に "enc_XXXX" フォルダが生成される
245
  try:
246
  subprocess.run(
247
  ["rclone", "copy", local_folder, f"cryptLocal:{subfolder_label}"],
@@ -253,17 +292,17 @@ class CivitAICrawler:
253
 
254
  enc_folder_path = os.path.join(encrypted_base_dir, subfolder_label)
255
  if not os.path.isdir(enc_folder_path):
256
- logger.error(f"[ERROR] {enc_folder_path} is not a directory. Rclone must have produced a file instead.")
257
  return None
258
 
259
- # フォルダをアップロード
260
  try:
261
  self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
262
  logger.info(f"Uploaded encrypted folder: {enc_folder_path}")
263
  except Exception as e:
264
  logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
265
 
266
- # ローカル平文フォルダと暗号フォルダを削除
267
  try:
268
  shutil.rmtree(local_folder)
269
  shutil.rmtree(enc_folder_path)
@@ -310,6 +349,43 @@ class CivitAICrawler:
310
  logger.error(f"Failed after {max_retries} attempts: {e}")
311
  raise
312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
314
  """
315
  フォルダを Hugging Face リポジトリに一括アップロード
@@ -403,36 +479,30 @@ class CivitAICrawler:
403
  logger.warning(f"No modelVersions found for ID {model_id}.")
404
  return
405
 
406
- # フォルダ名として適当な名前をつける
407
- # たとえばモデル名をベースにフォルダを作る(被り防止にUUIDを付与)
408
  folder_name = model_info.get("name", "UnknownModel")
409
- folder_name = re.sub(r'[\\/*?:"<>|]', '_', folder_name) # OSで使えない文字除去
410
  folder_name += "_" + str(uuid.uuid4())[:8]
411
  os.makedirs(folder_name, exist_ok=True)
412
 
413
- # モデルファイルをダウンロード
414
- self.download_model_files(versions, folder_name)
415
-
416
- # 画像を images/ にダウンロード
417
  self.download_images(versions, folder_name)
418
-
419
- # HTMLを取得
420
- model_page_url = f"{self.config.URLS['modelPage']}{model_id}"
421
- self.save_html_content(model_page_url, folder_name)
422
-
423
- # model_info.json保存
424
  self.save_model_info_json(model_info, folder_name)
425
 
426
- # === フォルダごと暗号化 & アップロード
 
 
 
427
  enc_subfolder = self.encrypt_and_upload_folder(folder_name)
428
  if enc_subfolder is None:
429
  enc_subfolder = "[ENCRYPT_FAILED]"
430
 
431
  hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
432
-
433
- # model_list.logに追記 (実際の暗号フォルダ名をそのまま書き込む)
434
  with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
435
- # 例: "Pudu chileno (ID:12345): https://huggingface.co/xxx/tree/main/<暗号フォルダ>"
436
  f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n")
437
 
438
  except Exception as e:
 
132
  logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
133
  return {}
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  def download_images(self, model_versions: list, folder: str):
136
  """画像を images フォルダにまとめてダウンロードする."""
137
  images_folder = os.path.join(folder, "images")
 
174
  logger.info(f"Saved model_info.json: {info_path}")
175
  except Exception as e:
176
  logger.error(f"Failed to save model info JSON: {e}")
177
+ # =========================================================================
178
+ # ここが重要:
179
+ # - 最新バージョンはまとめて folder_name にダウンロード (一度に暗号化アップロード)
180
+ # - 古いバージョンは1つずつダウンロード→暗号化→アップロード→削除 でストレージを節約
181
+ # =========================================================================
182
def download_and_process_versions(self, model_versions: list, folder: str):
    """Download the latest version in bulk; upload older versions one file at a time.

    The files of the latest version (index 0) are downloaded into ``folder``
    and left there, so the caller can encrypt and upload the whole folder
    later. The files of every older version are processed one by one
    (download, encrypt and upload, delete) to keep local storage usage low.

    Args:
        model_versions: Version dicts, latest first; each may carry a
            ``"files"`` list whose entries provide ``"downloadUrl"`` and
            ``"name"``.
        folder: Working directory for this model. Older versions are staged
            under ``<folder>/old_versions``.
    """
    if not model_versions:
        # Guard against IndexError on model_versions[0].
        logger.warning("download_and_process_versions: no model versions given; nothing to do.")
        return

    # 1) Latest version: download everything into `folder`.
    latest_version = model_versions[0]
    logger.info(f"Processing latest version: {latest_version.get('name','(NoName)')}")

    for file_info in latest_version.get("files", []):
        download_url = file_info["downloadUrl"]
        file_name = file_info["name"]
        _, exhausted = self._download_with_login_retry(download_url, folder, file_name)
        if exhausted:
            dummy_file_path = self._write_download_failed_marker(folder, file_name)
            if dummy_file_path:
                logger.error(f"Failed to download {file_name}. Dummy file created: {dummy_file_path}")

    # 2) Older versions: download -> encrypt/upload -> delete, one file at a time.
    older_versions = model_versions[1:]
    if not older_versions:
        return

    old_versions_folder = os.path.join(folder, "old_versions")
    os.makedirs(old_versions_folder, exist_ok=True)

    for version in older_versions:
        logger.info(f"Processing older version: {version.get('name','(NoName)')}")
        for file_info in version.get("files", []):
            file_name = file_info["name"]
            download_url = file_info["downloadUrl"]
            local_path, exhausted = self._download_with_login_retry(
                download_url, old_versions_folder, file_name
            )

            if exhausted:
                dummy_file_path = self._write_download_failed_marker(old_versions_folder, file_name)
                if dummy_file_path:
                    logger.error(f"Failed to download {file_name}. Dummy file: {dummy_file_path}")
                continue  # move on to the next file

            if not local_path:
                # download_file gave back no path (presumably its failure
                # signal - confirm against download_file); there is nothing
                # to move or upload.
                logger.warning(f"No local file for {file_name}; skipping encrypt/upload.")
                continue

            self._encrypt_and_upload_single_file(local_path, old_versions_folder, file_name)


def _download_with_login_retry(self, download_url: str, folder: str, file_name: str):
    """Download one file, retrying while a stray ``login`` file appears.

    After each call to ``self.download_file`` the target folder is checked
    for an entry named ``login``; if present it is removed and the download
    is retried, up to 5 times.

    Returns:
        A ``(local_path, exhausted)`` tuple: the value returned by the last
        ``self.download_file`` call, and whether all 5 retries were used up.
    """
    login_detected_count = 0
    local_path = None
    while login_detected_count < 5:
        local_path = self.download_file(download_url, folder, file_name)
        if local_path and "login" in os.listdir(folder):
            login_detected_count += 1
            os.remove(os.path.join(folder, "login"))
            logger.warning(f"'login' file found, retrying {file_name} ({login_detected_count}/5)")
        else:
            break
    return local_path, login_detected_count >= 5


def _write_download_failed_marker(self, folder: str, file_name: str):
    """Write ``<file_name>.download_failed`` into ``folder``.

    Returns:
        The marker path, or ``None`` when the marker could not be written
        (the failure is logged).
    """
    dummy_file_path = os.path.join(folder, f"{file_name}.download_failed")
    try:
        with open(dummy_file_path, "w") as f:
            f.write("Download failed after 5 attempts.")
    except OSError as e:
        logger.error(f"Failed to create dummy file for {file_name}: {e}")
        return None
    return dummy_file_path


def _encrypt_and_upload_single_file(self, local_path: str, old_versions_folder: str, file_name: str):
    """Encrypt and upload a single downloaded file via a temporary folder.

    The file is moved into ``<old_versions_folder>/temp_single`` so that
    ``encrypt_and_upload_folder`` handles exactly this one file. The
    temporary folder is always removed afterwards if it still exists.
    """
    single_file_folder = os.path.join(old_versions_folder, "temp_single")
    os.makedirs(single_file_folder, exist_ok=True)
    try:
        # Build a folder that contains only this file.
        shutil.move(local_path, os.path.join(single_file_folder, file_name))
        enc_label = self.encrypt_and_upload_folder(single_file_folder)
        if enc_label is None:
            logger.error(f"Encrypt/upload returned no folder label for old version file: {file_name}")
        else:
            # Record where the file went; otherwise the destination is lost.
            logger.info(f"Old version file {file_name} uploaded under: {enc_label}")
    except Exception as e:
        # Best effort: one failing file must not abort the remaining ones.
        logger.error(f"Failed to encrypt/upload old version file: {e}")
    finally:
        # Remove the temp folder if encrypt_and_upload_folder left it behind.
        if os.path.exists(single_file_folder):
            shutil.rmtree(single_file_folder)
            logger.info(f"Removed temp_single folder {single_file_folder}")
261
+
262
+ # ========== 既存の encrypt_and_upload_folder はそのまま活用 ==========
263
  def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
 
 
 
 
 
 
 
 
264
  if not os.path.exists(local_folder):
265
  logger.error(f"encrypt_and_upload_folder: folder not found: {local_folder}")
266
  return None
 
268
  encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
269
  os.makedirs(encrypted_base_dir, exist_ok=True)
270
 
271
+ # 前のコードにある「古い暗号ファイル削除」はそのまま残す
 
272
  for item in os.listdir(encrypted_base_dir):
273
  item_path = os.path.join(encrypted_base_dir, item)
274
  try:
 
279
  logger.info(f"[CLEANUP] Removed old encrypted item: {item_path}")
280
  except Exception as e:
281
  logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")
 
282
 
 
283
  subfolder_label = "enc_" + str(uuid.uuid4())[:8]
 
 
284
  try:
285
  subprocess.run(
286
  ["rclone", "copy", local_folder, f"cryptLocal:{subfolder_label}"],
 
292
 
293
  enc_folder_path = os.path.join(encrypted_base_dir, subfolder_label)
294
  if not os.path.isdir(enc_folder_path):
295
+ logger.error(f"[ERROR] {enc_folder_path} is not a directory.")
296
  return None
297
 
298
+ # アップロード
299
  try:
300
  self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
301
  logger.info(f"Uploaded encrypted folder: {enc_folder_path}")
302
  except Exception as e:
303
  logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
304
 
305
+ # ローカル削除
306
  try:
307
  shutil.rmtree(local_folder)
308
  shutil.rmtree(enc_folder_path)
 
349
  logger.error(f"Failed after {max_retries} attempts: {e}")
350
  raise
351
 
352
def upload_file(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
    """Upload a single file through ``self.api.upload_file`` with retries.

    Args:
        file_path: Local path of the file to upload.
        repo_id: Target repository; defaults to ``self.repo_ids['current']``.
        path_in_repo: Destination path inside the repository; defaults to
            the base name of ``file_path``.

    Raises:
        Exception: Re-raises the last upload error once 5 ordinary attempts
            have failed. Errors raised while creating a replacement
            repository propagate immediately.
    """
    if repo_id is None:
        repo_id = self.repo_ids['current']
    if path_in_repo is None:
        path_in_repo = os.path.basename(file_path)

    max_retries = 5
    attempt = 0
    while attempt < max_retries:
        try:
            self.api.upload_file(
                path_or_fileobj=file_path,
                repo_id=repo_id,
                path_in_repo=path_in_repo
            )
            logger.info(f"Uploaded file: {file_path} to {repo_id} at {path_in_repo}")
            return
        except Exception as e:
            attempt += 1
            error_message = str(e)
            if "over the limit of 100000 files" in error_message:
                logger.warning("File limit exceeded, creating a new repo.")
                self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                # Retarget the retry at the new repo. Without this the loop
                # keeps uploading to the full repo, hits the same error, and
                # creates new repos forever because `attempt` is reset.
                repo_id = self.repo_ids['current']
                attempt = 0
                continue
            elif "you can retry this action in about 1 hour" in error_message:
                logger.warning("Rate limit hit. Waiting 1 hour...")
                time.sleep(3600)
                # Waiting out a rate limit does not consume a retry.
                attempt -= 1
            else:
                if attempt < max_retries:
                    logger.warning(f"Failed to upload {file_path}, retry {attempt}/{max_retries}")
                else:
                    logger.error(f"Failed after {max_retries} attempts: {e}")
                    raise
388
+
389
  def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
390
  """
391
  フォルダを Hugging Face リポジトリに一括アップロード
 
479
  logger.warning(f"No modelVersions found for ID {model_id}.")
480
  return
481
 
482
+ # ベースとなるフォルダ名
 
483
  folder_name = model_info.get("name", "UnknownModel")
484
+ folder_name = re.sub(r'[\\/*?:"<>|]', '_', folder_name)
485
  folder_name += "_" + str(uuid.uuid4())[:8]
486
  os.makedirs(folder_name, exist_ok=True)
487
 
488
+ # (A) 最新バージョンファイル + (B) 画像 + (C) HTML + model_info.json
489
+ # → 全部 folder_name に置く
490
+ # → 後でまとめて encrypt_and_upload_folder
491
+ self.download_and_process_versions(versions, folder_name)
492
  self.download_images(versions, folder_name)
493
+ self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name)
 
 
 
 
 
494
  self.save_model_info_json(model_info, folder_name)
495
 
496
+ # この時点で「最新バージョン」の大ファイルが folder_name に残っている
497
+ # しかし old_versions は既に1つずつ暗号化→削除済み
498
+
499
+ # まとめて暗号化 & アップロード
500
  enc_subfolder = self.encrypt_and_upload_folder(folder_name)
501
  if enc_subfolder is None:
502
  enc_subfolder = "[ENCRYPT_FAILED]"
503
 
504
  hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
 
 
505
  with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
 
506
  f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n")
507
 
508
  except Exception as e: