ttttdiva committed on
Commit
6b12d98
·
verified ·
1 Parent(s): 7e2c78b

Upload main.py

Browse files
Files changed (1) hide show
  1. main.py +47 -255
main.py CHANGED
@@ -56,7 +56,6 @@ class CivitAICrawler:
56
 
57
  rclone_conf_base64 = os.environ.get("RCLONE_CONF_BASE64")
58
  if rclone_conf_base64:
59
- # カレントディレクトリ配下に .rclone_config ディレクトリを作成
60
  config_dir = os.path.join(os.getcwd(), ".rclone_config")
61
  os.makedirs(config_dir, exist_ok=True)
62
 
@@ -64,7 +63,6 @@ class CivitAICrawler:
64
  with open(conf_path, "wb") as f:
65
  f.write(base64.b64decode(rclone_conf_base64))
66
 
67
- # rclone がここを参照するように設定
68
  os.environ["RCLONE_CONFIG"] = conf_path
69
  logger.info(f"[INFO] Created rclone.conf at {conf_path}")
70
  else:
@@ -115,7 +113,6 @@ class CivitAICrawler:
115
  filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
116
  file_path = os.path.join(destination_folder, filename)
117
 
118
- # ダウンロードとファイル保存処理
119
  with open(file_path, 'wb') as file:
120
  for chunk in response.iter_content(chunk_size=8192):
121
  file.write(chunk)
@@ -136,10 +133,12 @@ class CivitAICrawler:
136
  """画像を images フォルダにまとめてダウンロードする."""
137
  images_folder = os.path.join(folder, "images")
138
  os.makedirs(images_folder, exist_ok=True)
 
139
  images = []
140
  for version in model_versions:
141
  for img in version.get("images", []):
142
  images.append(img["url"])
 
143
  for image_url in images:
144
  image_name = os.path.basename(image_url)
145
  local_path = os.path.join(images_folder, image_name)
@@ -158,7 +157,7 @@ class CivitAICrawler:
158
  try:
159
  resp = requests.get(model_page_url)
160
  resp.raise_for_status()
161
- html_path = os.path.join(folder, "page.html")
162
  with open(html_path, 'w', encoding='utf-8') as f:
163
  f.write(resp.text)
164
  logger.info(f"Saved HTML: {html_path}")
@@ -166,7 +165,7 @@ class CivitAICrawler:
166
  logger.error(f"Error saving HTML content from {model_page_url}: {e}")
167
 
168
  def save_model_info_json(self, model_info: dict, folder: str):
169
- """モデル情報をJSONファイルとして保存."""
170
  info_path = os.path.join(folder, "model_info.json")
171
  try:
172
  with open(info_path, 'w', encoding='utf-8') as f:
@@ -174,202 +173,50 @@ class CivitAICrawler:
174
  logger.info(f"Saved model_info.json: {info_path}")
175
  except Exception as e:
176
  logger.error(f"Failed to save model info JSON: {e}")
177
- # =========================================================================
178
- # ここが重要:
179
- # - 最新バージョンはまとめて folder_name にダウンロード (一度に暗号化アップロード)
180
- # - 古いバージョンは1つずつダウンロード→暗号化→アップロード→削除 でストレージを節約
181
- # =========================================================================
182
  def download_and_process_versions(self, model_versions: list, folder: str):
183
- """最新バージョンをまとめてダウンロード、old_versionsは1つずつアップして削除。"""
 
 
 
184
 
185
- # 1) 最新バージョン (インデックス0) のファイルを folder にダウンロード
186
  latest_version = model_versions[0]
187
- logger.info(f"Processing latest version: {latest_version.get('name','(NoName)')}")
188
-
189
  for file_info in latest_version.get("files", []):
190
  download_url = file_info["downloadUrl"]
191
  file_name = file_info["name"]
192
- login_detected_count = 0
193
-
194
- while login_detected_count < 5:
195
- local_path = self.download_file(download_url, folder, file_name)
196
- if local_path and "login" in os.listdir(folder):
197
- login_detected_count += 1
198
- os.remove(os.path.join(folder, "login"))
199
- logger.warning(f"'login' file found, retrying {file_name} ({login_detected_count}/5)")
200
- else:
201
- break
202
-
203
- if login_detected_count >= 5:
204
- dummy_file_path = os.path.join(folder, f"{file_name}.download_failed")
205
- try:
206
- with open(dummy_file_path, "w") as f:
207
- f.write("Download failed after 5 attempts.")
208
- logger.error(f"Failed to download {file_name}. Dummy file created: {dummy_file_path}")
209
- except Exception as e:
210
- logger.error(f"Failed to create dummy file for {file_name}: {e}")
211
-
212
- # 2) 古いバージョンがあれば 1つずつダウンロード→暗号化アップロード→削除
213
  if len(model_versions) > 1:
214
  old_versions_folder = os.path.join(folder, "old_versions")
215
  os.makedirs(old_versions_folder, exist_ok=True)
216
 
217
  for version in model_versions[1:]:
218
- logger.info(f"Processing older version: {version.get('name','(NoName)')}")
219
  for file_info in version.get("files", []):
220
  file_name = file_info["name"]
221
  download_url = file_info["downloadUrl"]
222
- login_detected_count = 0
223
-
224
- while login_detected_count < 5:
225
- local_path = self.download_file(download_url, old_versions_folder, file_name)
226
- if local_path and "login" in os.listdir(old_versions_folder):
227
- login_detected_count += 1
228
- os.remove(os.path.join(old_versions_folder, "login"))
229
- logger.warning(f"'login' file found, retrying {file_name} ({login_detected_count}/5)")
230
- else:
231
- break
232
-
233
- if login_detected_count >= 5:
234
- dummy_file_path = os.path.join(old_versions_folder, f"{file_name}.download_failed")
235
- try:
236
- with open(dummy_file_path, "w") as f:
237
- f.write("Download failed after 5 attempts.")
238
- logger.error(f"Failed to download {file_name}. Dummy file: {dummy_file_path}")
239
- except Exception as e:
240
- logger.error(f"Failed to create dummy file for {file_name}: {e}")
241
- continue # 次のファイルへ
242
-
243
- # ===== ダウンロード成功した古いバージョンファイルを暗号化アップロード =====
244
- # フォルダごと暗号化でもいいですが、大容量を避けるためファイル単位で暗号化する例
245
- # ここでは "encrypt_and_upload_folder" の代わりに「encrypt_and_upload_file」するなど
246
- # あるいはフォルダごとでもOK
247
- single_file_folder = os.path.join(old_versions_folder, "temp_single")
248
- os.makedirs(single_file_folder, exist_ok=True)
249
- try:
250
- single_file_path = shutil.move(local_path, os.path.join(single_file_folder, file_name))
251
-
252
- # ★ここでファイルのみを暗号化したいので encrypt_and_upload_single_file() を使用
253
- self.encrypt_and_upload_single_file(os.path.join(single_file_folder, file_name))
254
- except Exception as e:
255
- logger.error(f"Failed to encrypt/upload old version file: {e}")
256
- finally:
257
- # single_file_folder削除(encrypt_and_upload_folderで消えるはず)
258
- if os.path.exists(single_file_folder):
259
- shutil.rmtree(single_file_folder)
260
- logger.info(f"Removed temp_single folder {single_file_folder}")
261
-
262
- def encrypt_and_upload_single_file(self, local_file: str) -> Optional[str]:
263
- """
264
- 単一ファイルを暗号化→アップロード→ローカル削除する。
265
- - rclone mkdir cryptLocal:enc_xxxx
266
- - rclone copyto local_file => cryptLocal:enc_xxxx/filename
267
- - 差分検知で作られた暗号フォルダを発見→upload_folder()→削除
268
- """
269
- import uuid
270
- if not os.path.isfile(local_file):
271
- logger.error(f"[encrypt_and_upload_single_file] Not a file: {local_file}")
272
- return None
273
-
274
- encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
275
- os.makedirs(encrypted_base_dir, exist_ok=True)
276
-
277
- # 古い暗号物を削除
278
- before_dirs = set(os.listdir(encrypted_base_dir))
279
- for item in before_dirs:
280
- item_path = os.path.join(encrypted_base_dir, item)
281
- try:
282
- if os.path.isfile(item_path):
283
- os.remove(item_path)
284
- else:
285
- shutil.rmtree(item_path)
286
- logger.info(f"[CLEANUP] Removed old encrypted item: {item_path}")
287
- except Exception as e:
288
- logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")
289
-
290
- # 1) mkdir
291
- subfolder_label = "enc_" + str(uuid.uuid4())[:8]
292
- try:
293
- subprocess.run(
294
- ["rclone", "mkdir", f"cryptLocal:{subfolder_label}"],
295
- check=True
296
- )
297
- logger.info(f"[OK] rclone mkdir cryptLocal:{subfolder_label}")
298
- except subprocess.CalledProcessError as e:
299
- logger.error(f"rclone mkdir failed: {e}")
300
- return None
301
-
302
- # 2) copyto (ファイル) => cryptLocal:enc_xxx/filename
303
- filename_in_repo = os.path.basename(local_file)
304
- try:
305
- subprocess.run(
306
- [
307
- "rclone", "copyto",
308
- local_file,
309
- f"cryptLocal:{subfolder_label}/{filename_in_repo}",
310
- "--create-empty-src-dirs"
311
- ],
312
- check=True
313
- )
314
- logger.info(f"[OK] rclone copyto {local_file} => cryptLocal:{subfolder_label}/{filename_in_repo}")
315
- except subprocess.CalledProcessError as e:
316
- logger.error(f"rclone copyto failed: {e}")
317
- return None
318
-
319
- # 3) 差分検知
320
- after_dirs = set(os.listdir(encrypted_base_dir))
321
- diff = after_dirs - before_dirs
322
- if not diff:
323
- logger.error("[ERROR] No new directory appeared in ./encrypted after rclone copyto.")
324
- return None
325
- if len(diff) > 1:
326
- logger.warning(f"[WARN] Multiple new directories found in ./encrypted: {diff}")
327
-
328
- enc_folder_name = diff.pop() # 1つ抜き取る
329
- enc_folder_path = os.path.join(encrypted_base_dir, enc_folder_name)
330
- if not os.path.isdir(enc_folder_path):
331
- logger.error(f"[ERROR] {enc_folder_path} is not a directory.")
332
- return None
333
-
334
- # 4) upload_folder
335
- try:
336
- self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
337
- logger.info(f"[OK] Uploaded encrypted folder: {enc_folder_path}")
338
- except Exception as e:
339
- logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
340
-
341
- # 5) ローカル削除(暗号フォルダ + 元ファイル)
342
- try:
343
- if os.path.exists(local_file):
344
- os.remove(local_file)
345
- if os.path.isdir(enc_folder_path):
346
- shutil.rmtree(enc_folder_path)
347
- logger.info(f"[CLEANUP] Removed {local_file} and {enc_folder_path}")
348
- except Exception as e:
349
- logger.warning(f"[CLEANUP] Failed to remove local items: {e}")
350
-
351
- return subfolder_label
352
 
353
  def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
354
  """
355
- 1. rclone mkdir cryptLocal:subfolder_label で空フォルダを作る
356
- 2. rclone copy local_folder => cryptLocal:subfolder_label --create-empty-src-dirs
357
- 3. そのフォルダを self.upload_folder() でアップ
358
- 4. ローカル平文フォルダ & 暗号フォルダを削除
359
- 5. subfolder_label を返す
360
  """
361
- if not os.path.exists(local_folder):
362
- logger.error(f"encrypt_and_upload_folder: folder not found: {local_folder}")
363
  return None
364
 
365
  encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
366
  os.makedirs(encrypted_base_dir, exist_ok=True)
367
 
368
- # 既存の暗号ファイル削除(不要ならコメントアウト)
369
  for item in os.listdir(encrypted_base_dir):
370
  item_path = os.path.join(encrypted_base_dir, item)
371
  try:
372
- if os.path.isfile(item_path) or os.path.islink(item_path):
373
  os.remove(item_path)
374
  else:
375
  shutil.rmtree(item_path)
@@ -377,46 +224,48 @@ class CivitAICrawler:
377
  except Exception as e:
378
  logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")
379
 
380
- # サブフォルダ名を生成(enc_ + UUID)
381
  subfolder_label = "enc_" + str(uuid.uuid4())[:8]
382
 
383
- # ★ 追加: mkdir で先に空ディレクトリを作っておく
384
  try:
385
- subprocess.run(
386
- ["rclone", "mkdir", f"cryptLocal:{subfolder_label}"],
387
- check=True
388
- )
389
  except subprocess.CalledProcessError as e:
390
  logger.error(f"rclone mkdir failed: {e}")
391
  return None
392
 
393
- # ★ --create-empty-src-dirs オプションを付けて copy
394
  try:
395
  subprocess.run(
396
  [
397
- "rclone",
398
- "copy",
399
- local_folder,
400
  f"cryptLocal:{subfolder_label}",
401
  "--create-empty-src-dirs"
402
  ],
403
  check=True
404
  )
 
405
  except subprocess.CalledProcessError as e:
406
  logger.error(f"rclone copy failed: {e}")
407
  return None
408
 
409
- # 暗号フォルダのパス(必ずディレクトリができている前提)
410
- enc_folder_path = os.path.join(encrypted_base_dir, subfolder_label)
 
 
 
 
 
411
 
412
  if not os.path.isdir(enc_folder_path):
413
- logger.error(f"[ERROR] {enc_folder_path} is not a directory, something is still off.")
414
  return None
415
 
416
- # アップロード
417
  try:
418
  self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
419
- logger.info(f"Uploaded encrypted folder: {enc_folder_path}")
420
  except Exception as e:
421
  logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
422
 
@@ -424,9 +273,9 @@ class CivitAICrawler:
424
  try:
425
  shutil.rmtree(local_folder)
426
  shutil.rmtree(enc_folder_path)
427
- logger.info(f"Removed local folder: {local_folder} and {enc_folder_path}")
428
  except Exception as e:
429
- logger.error(f"Failed to remove local folders: {e}")
430
 
431
  return subfolder_label
432
 
@@ -467,47 +316,7 @@ class CivitAICrawler:
467
  logger.error(f"Failed after {max_retries} attempts: {e}")
468
  raise
469
 
470
- def upload_file(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
471
- if repo_id is None:
472
- repo_id = self.repo_ids['current']
473
- if path_in_repo is None:
474
- path_in_repo = os.path.basename(file_path)
475
-
476
- max_retries = 5
477
- attempt = 0
478
- while attempt < max_retries:
479
- try:
480
- self.api.upload_file(
481
- path_or_fileobj=file_path,
482
- repo_id=repo_id,
483
- path_in_repo=path_in_repo
484
- )
485
- logger.info(f"Uploaded file: {file_path} to {repo_id} at {path_in_repo}")
486
- return
487
- except Exception as e:
488
- attempt += 1
489
- error_message = str(e)
490
- if "over the limit of 100000 files" in error_message:
491
- logger.warning("File limit exceeded, creating a new repo.")
492
- self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
493
- self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
494
- attempt = 0
495
- continue
496
- elif "you can retry this action in about 1 hour" in error_message:
497
- logger.warning("Rate limit hit. Waiting 1 hour...")
498
- time.sleep(3600)
499
- attempt -= 1
500
- else:
501
- if attempt < max_retries:
502
- logger.warning(f"Failed to upload {file_path}, retry {attempt}/{max_retries}")
503
- else:
504
- logger.error(f"Failed after {max_retries} attempts: {e}")
505
- raise
506
-
507
  def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
508
- """
509
- フォルダを Hugging Face リポジトリに一括アップロード
510
- """
511
  if path_in_repo is None:
512
  path_in_repo = os.path.basename(folder_path)
513
 
@@ -544,7 +353,6 @@ class CivitAICrawler:
544
 
545
  @staticmethod
546
  def increment_repo_name(repo_id: str) -> str:
547
- """リポジトリ名の末尾の数字をインクリメントする。"""
548
  match = re.search(r'(\d+)$', repo_id)
549
  if match:
550
  number = int(match.group(1)) + 1
@@ -554,7 +362,6 @@ class CivitAICrawler:
554
  return new_repo_id
555
 
556
  def read_model_list(self) -> dict:
557
- """モデルリストを読み込む。"""
558
  model_list = {}
559
  try:
560
  with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
@@ -570,7 +377,6 @@ class CivitAICrawler:
570
  return model_list
571
 
572
  def get_repo_info(self, repo_id):
573
- """リポジトリの情報を取得する。"""
574
  try:
575
  repo_info = self.api.repo_info(repo_id=repo_id, files_metadata=True)
576
  file_paths = [sibling.rfilename for sibling in repo_info.siblings]
@@ -580,41 +386,31 @@ class CivitAICrawler:
580
  return []
581
 
582
  def process_model(self, model_url: str):
583
- """1つのモデルをダウンロードしてフォルダ丸ごと暗号化&アップロードする."""
584
  try:
585
- # model_idを取得
586
  model_id = model_url.rstrip("/").split("/")[-1]
587
-
588
- # モデル情報を取得
589
  model_info = self.get_model_info(model_id)
590
  if not model_info or "modelVersions" not in model_info:
591
  logger.error(f"No valid model info for ID {model_id}. Skipping.")
592
  return
593
 
594
- # バージョン一覧
595
  versions = model_info["modelVersions"]
596
  if not versions:
597
  logger.warning(f"No modelVersions found for ID {model_id}.")
598
  return
599
 
600
- # ベースとなるフォルダ名
601
- folder_name = model_info.get("name", "UnknownModel")
602
  folder_name = re.sub(r'[\\/*?:"<>|]', '_', folder_name)
603
  folder_name += "_" + str(uuid.uuid4())[:8]
604
  os.makedirs(folder_name, exist_ok=True)
605
 
606
- # (A) 最新バージョンファイル + (B) 画像 + (C) HTML + model_info.json
607
- # → 全部 folder_name に置く
608
- # → 後でまとめて encrypt_and_upload_folder
609
  self.download_and_process_versions(versions, folder_name)
610
  self.download_images(versions, folder_name)
611
  self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name)
612
  self.save_model_info_json(model_info, folder_name)
613
 
614
- # ↑ この時点で「最新バージョン」の大ファイルが folder_name に残っている
615
- # しかし old_versions は既に1つずつ暗号化→削除済み
616
-
617
- # まとめて暗号化 & アップロード
618
  enc_subfolder = self.encrypt_and_upload_folder(folder_name)
619
  if enc_subfolder is None:
620
  enc_subfolder = "[ENCRYPT_FAILED]"
@@ -627,19 +423,18 @@ class CivitAICrawler:
627
  logger.error(f"Error in process_model ({model_url}): {e}")
628
 
629
  async def crawl(self):
630
- """モデルを定期的にチェックし、更新を行う。"""
631
  while True:
632
  try:
633
  login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
634
 
635
- # model_list.logを最新化
636
  model_list_path = hf_hub_download(
637
  repo_id=self.repo_ids['model_list'],
638
  filename=self.config.LIST_FILE
639
  )
640
  shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
641
 
642
- # ログファイルを最新化
643
  local_file_path = hf_hub_download(
644
  repo_id=self.repo_ids["log"],
645
  filename=self.config.LOG_FILE
@@ -674,14 +469,12 @@ class CivitAICrawler:
674
  else:
675
  await asyncio.sleep(2)
676
 
677
- # 新モデルをold_modelsに追加し、ログを更新
678
  old_models.append(model_id)
679
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
680
  f.write(json.dumps(old_models) + "\n")
681
  f.write(f"{self.repo_ids['current']}\n")
682
  logger.info(f"Updated log with new model ID: {model_id}")
683
 
684
- # ログファイル & model_list.logをアップロード
685
  self.upload_file(
686
  file_path=self.config.LOG_FILE,
687
  repo_id=self.repo_ids["log"],
@@ -693,7 +486,6 @@ class CivitAICrawler:
693
  path_in_repo=self.config.LIST_FILE
694
  )
695
  else:
696
- # 新着なし → ログを最新化してアップロードだけして待機
697
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
698
  f.write(json.dumps(latest_model_ids) + "\n")
699
  f.write(f"{self.repo_ids['current']}\n")
 
56
 
57
  rclone_conf_base64 = os.environ.get("RCLONE_CONF_BASE64")
58
  if rclone_conf_base64:
 
59
  config_dir = os.path.join(os.getcwd(), ".rclone_config")
60
  os.makedirs(config_dir, exist_ok=True)
61
 
 
63
  with open(conf_path, "wb") as f:
64
  f.write(base64.b64decode(rclone_conf_base64))
65
 
 
66
  os.environ["RCLONE_CONFIG"] = conf_path
67
  logger.info(f"[INFO] Created rclone.conf at {conf_path}")
68
  else:
 
113
  filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
114
  file_path = os.path.join(destination_folder, filename)
115
 
 
116
  with open(file_path, 'wb') as file:
117
  for chunk in response.iter_content(chunk_size=8192):
118
  file.write(chunk)
 
133
  """画像を images フォルダにまとめてダウンロードする."""
134
  images_folder = os.path.join(folder, "images")
135
  os.makedirs(images_folder, exist_ok=True)
136
+
137
  images = []
138
  for version in model_versions:
139
  for img in version.get("images", []):
140
  images.append(img["url"])
141
+
142
  for image_url in images:
143
  image_name = os.path.basename(image_url)
144
  local_path = os.path.join(images_folder, image_name)
 
157
  try:
158
  resp = requests.get(model_page_url)
159
  resp.raise_for_status()
160
+ html_path = os.path.join(folder, f"{os.path.basename(folder)}.html")
161
  with open(html_path, 'w', encoding='utf-8') as f:
162
  f.write(resp.text)
163
  logger.info(f"Saved HTML: {html_path}")
 
165
  logger.error(f"Error saving HTML content from {model_page_url}: {e}")
166
 
167
  def save_model_info_json(self, model_info: dict, folder: str):
168
+ """モデル情報(json)の保存"""
169
  info_path = os.path.join(folder, "model_info.json")
170
  try:
171
  with open(info_path, 'w', encoding='utf-8') as f:
 
173
  logger.info(f"Saved model_info.json: {info_path}")
174
  except Exception as e:
175
  logger.error(f"Failed to save model info JSON: {e}")
176
+
 
 
 
 
177
  def download_and_process_versions(self, model_versions: list, folder: str):
178
+ """
179
+ 最新バージョン + 古いバージョンをすべて1つのフォルダにダウンロードして
180
+ 最後にまとめて暗号化アップロードする。
181
+ """
182
 
183
+ # 1) 最新バージョンは folder
184
  latest_version = model_versions[0]
 
 
185
  for file_info in latest_version.get("files", []):
186
  download_url = file_info["downloadUrl"]
187
  file_name = file_info["name"]
188
+ local_path = self.download_file(download_url, folder, file_name)
189
+ # リトライ処理など省略
190
+
191
+ # 2) 古いバージョンを "folder/old_versions" にまとめる
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  if len(model_versions) > 1:
193
  old_versions_folder = os.path.join(folder, "old_versions")
194
  os.makedirs(old_versions_folder, exist_ok=True)
195
 
196
  for version in model_versions[1:]:
 
197
  for file_info in version.get("files", []):
198
  file_name = file_info["name"]
199
  download_url = file_info["downloadUrl"]
200
+ local_path = self.download_file(download_url, old_versions_folder, file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
203
  """
204
+ 1. rclone mkdir cryptLocal:subfolder_label
205
+ 2. rclone copy local_folder => cryptLocal:subfolder_label
206
+ 3. upload_folder() → ローカル削除
 
 
207
  """
208
+ if not os.path.isdir(local_folder):
209
+ logger.error(f"encrypt_and_upload_folder: {local_folder} is not a directory.")
210
  return None
211
 
212
  encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
213
  os.makedirs(encrypted_base_dir, exist_ok=True)
214
 
215
+ # 既存の暗号物を削除
216
  for item in os.listdir(encrypted_base_dir):
217
  item_path = os.path.join(encrypted_base_dir, item)
218
  try:
219
+ if os.path.isfile(item_path):
220
  os.remove(item_path)
221
  else:
222
  shutil.rmtree(item_path)
 
224
  except Exception as e:
225
  logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")
226
 
 
227
  subfolder_label = "enc_" + str(uuid.uuid4())[:8]
228
 
229
+ # mkdir
230
  try:
231
+ subprocess.run(["rclone", "mkdir", f"cryptLocal:{subfolder_label}"], check=True)
232
+ logger.info(f"[OK] rclone mkdir cryptLocal:{subfolder_label}")
 
 
233
  except subprocess.CalledProcessError as e:
234
  logger.error(f"rclone mkdir failed: {e}")
235
  return None
236
 
237
+ # copy
238
  try:
239
  subprocess.run(
240
  [
241
+ "rclone", "copy",
242
+ local_folder,
 
243
  f"cryptLocal:{subfolder_label}",
244
  "--create-empty-src-dirs"
245
  ],
246
  check=True
247
  )
248
+ logger.info(f"[OK] rclone copy {local_folder} => cryptLocal:{subfolder_label}")
249
  except subprocess.CalledProcessError as e:
250
  logger.error(f"rclone copy failed: {e}")
251
  return None
252
 
253
+ # 差分検知
254
+ after_list = os.listdir(encrypted_base_dir)
255
+ if not after_list:
256
+ logger.error("[ERROR] No new directory in ./encrypted after rclone copy.")
257
+ return None
258
+ enc_folder_name = after_list[0] # 先頭を使う
259
+ enc_folder_path = os.path.join(encrypted_base_dir, enc_folder_name)
260
 
261
  if not os.path.isdir(enc_folder_path):
262
+ logger.error(f"[ERROR] {enc_folder_path} is not a directory.")
263
  return None
264
 
265
+ # upload_folder
266
  try:
267
  self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
268
+ logger.info(f"[OK] Uploaded encrypted folder: {enc_folder_path}")
269
  except Exception as e:
270
  logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
271
 
 
273
  try:
274
  shutil.rmtree(local_folder)
275
  shutil.rmtree(enc_folder_path)
276
+ logger.info(f"[CLEANUP] Removed local folder: {local_folder} & {enc_folder_path}")
277
  except Exception as e:
278
+ logger.warning(f"[CLEANUP] Could not remove local folders: {e}")
279
 
280
  return subfolder_label
281
 
 
316
  logger.error(f"Failed after {max_retries} attempts: {e}")
317
  raise
318
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
 
 
 
320
  if path_in_repo is None:
321
  path_in_repo = os.path.basename(folder_path)
322
 
 
353
 
354
  @staticmethod
355
  def increment_repo_name(repo_id: str) -> str:
 
356
  match = re.search(r'(\d+)$', repo_id)
357
  if match:
358
  number = int(match.group(1)) + 1
 
362
  return new_repo_id
363
 
364
  def read_model_list(self) -> dict:
 
365
  model_list = {}
366
  try:
367
  with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
 
377
  return model_list
378
 
379
  def get_repo_info(self, repo_id):
 
380
  try:
381
  repo_info = self.api.repo_info(repo_id=repo_id, files_metadata=True)
382
  file_paths = [sibling.rfilename for sibling in repo_info.siblings]
 
386
  return []
387
 
388
  def process_model(self, model_url: str):
389
+ """1つのモデルをフォルダにまとめてダウンロード→暗号化→アップロード"""
390
  try:
 
391
  model_id = model_url.rstrip("/").split("/")[-1]
 
 
392
  model_info = self.get_model_info(model_id)
393
  if not model_info or "modelVersions" not in model_info:
394
  logger.error(f"No valid model info for ID {model_id}. Skipping.")
395
  return
396
 
 
397
  versions = model_info["modelVersions"]
398
  if not versions:
399
  logger.warning(f"No modelVersions found for ID {model_id}.")
400
  return
401
 
402
+ folder_name = model_info.get("name", "UnnamedModel")
 
403
  folder_name = re.sub(r'[\\/*?:"<>|]', '_', folder_name)
404
  folder_name += "_" + str(uuid.uuid4())[:8]
405
  os.makedirs(folder_name, exist_ok=True)
406
 
407
+ # 最新 + 古いバージョンすべて folder_name 下へ
 
 
408
  self.download_and_process_versions(versions, folder_name)
409
  self.download_images(versions, folder_name)
410
  self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name)
411
  self.save_model_info_json(model_info, folder_name)
412
 
413
+ # フォルダごと暗号化
 
 
 
414
  enc_subfolder = self.encrypt_and_upload_folder(folder_name)
415
  if enc_subfolder is None:
416
  enc_subfolder = "[ENCRYPT_FAILED]"
 
423
  logger.error(f"Error in process_model ({model_url}): {e}")
424
 
425
  async def crawl(self):
 
426
  while True:
427
  try:
428
  login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
429
 
430
+ # model_list.logの取得
431
  model_list_path = hf_hub_download(
432
  repo_id=self.repo_ids['model_list'],
433
  filename=self.config.LIST_FILE
434
  )
435
  shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
436
 
437
+ # ログファイルの取得
438
  local_file_path = hf_hub_download(
439
  repo_id=self.repo_ids["log"],
440
  filename=self.config.LOG_FILE
 
469
  else:
470
  await asyncio.sleep(2)
471
 
 
472
  old_models.append(model_id)
473
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
474
  f.write(json.dumps(old_models) + "\n")
475
  f.write(f"{self.repo_ids['current']}\n")
476
  logger.info(f"Updated log with new model ID: {model_id}")
477
 
 
478
  self.upload_file(
479
  file_path=self.config.LOG_FILE,
480
  repo_id=self.repo_ids["log"],
 
486
  path_in_repo=self.config.LIST_FILE
487
  )
488
  else:
 
489
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
490
  f.write(json.dumps(latest_model_ids) + "\n")
491
  f.write(f"{self.repo_ids['current']}\n")