ttttdiva committed (verified)
Commit fc1d399 · 1 Parent(s): a7e29db

Update main.py

Files changed (1):
  1. main.py +246 -313

main.py CHANGED
@@ -16,7 +16,6 @@ from fake_useragent import UserAgent
16
  from fastapi import FastAPI
17
  from huggingface_hub import HfApi, create_repo, hf_hub_download, login
18
 
19
- # Logging configuration
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
22
 
@@ -49,7 +48,6 @@ class Config:
49
 
50
  # ===== Additional settings for rclone =====
51
  RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
52
- # Local directory where the encrypted files are written (the backing directory for cryptLocal:)
53
  ENCRYPTED_DIR = "/home/user/app/encrypted"
54
 
55
 
@@ -63,61 +61,50 @@ class CivitAICrawler:
63
  self.repo_ids = self.config.REPO_IDS.copy()
64
  self.jst = self.config.JST
65
 
66
- # Set up rclone
67
  self.setup_rclone_conf()
68
 
69
  self.setup_routes()
70
 
71
  def setup_routes(self):
72
- """Configure the FastAPI routes."""
73
  @self.app.get("/")
74
  def read_root():
75
  now = str(datetime.datetime.now(self.jst))
76
- description = f"""
77
- This Space periodically crawls CivitAI and backs up new models to {self.repo_ids['current']}.
78
- model_list.log and civitai_backup.log are uploaded without encryption.
79
- Model folders and files are encrypted before upload.
80
- Status: {now} + currently running :D
81
- """
82
  return description
83
 
84
  @self.app.on_event("startup")
85
  async def startup_event():
86
  asyncio.create_task(self.crawl())
87
 
88
- # =============================================================================
89
- # rclone configuration & encrypted upload handling
90
- # =============================================================================
91
  def setup_rclone_conf(self):
92
- """Generate rclone.conf from the RCLONE_CONF_BASE64 environment variable and set RCLONE_CONFIG to point at it"""
93
  if not self.config.RCLONE_CONF_BASE64:
94
  logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
95
  return
96
-
97
  os.makedirs(".rclone_config", exist_ok=True)
98
  conf_path = os.path.join(".rclone_config", "rclone.conf")
99
  with open(conf_path, "wb") as f:
100
  f.write(base64.b64decode(self.config.RCLONE_CONF_BASE64))
101
-
102
  os.environ["RCLONE_CONFIG"] = conf_path
103
  logger.info(f"[INFO] rclone.conf created at: {conf_path}")
104
 
105
  def encrypt_with_rclone(self, local_path: str):
106
- """
107
- Copy the given file or directory to cryptLocal:.
108
- Folder structure and file names are encrypted according to rclone's filename_encryption setting.
109
- """
110
  if not os.path.exists(local_path):
111
  raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")
112
-
113
  # Clean up the encryption output directory beforehand
114
  if os.path.isdir(self.config.ENCRYPTED_DIR):
115
  shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
116
 
117
- top_level_name = os.path.basename(local_path.rstrip("/"))
118
- if not top_level_name:
119
- top_level_name = "unnamed"
120
-
121
  cmd = ["rclone", "copy", local_path, f"cryptLocal:{top_level_name}", "-v"]
122
  logger.info(f"[INFO] Running: {' '.join(cmd)}")
123
  subprocess.run(cmd, check=True)
@@ -128,15 +115,14 @@ class CivitAICrawler:
128
  f"[ERROR] {self.config.ENCRYPTED_DIR} not found. Check your rclone config."
129
  )
130
 
131
- # Example: retry handling inside upload_encrypted_files
132
  def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
 
133
  max_retries = 5
134
  for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
135
  for fn in files:
136
  encrypted_file_path = os.path.join(root, fn)
137
  if not os.path.isfile(encrypted_file_path):
138
  continue
139
-
140
  relative_path = os.path.relpath(encrypted_file_path, self.config.ENCRYPTED_DIR)
141
  upload_path_in_repo = os.path.join(base_path_in_repo, relative_path)
142
 
@@ -150,37 +136,26 @@ class CivitAICrawler:
150
  )
151
  logger.info(f"[OK] Uploaded => {repo_id}/{upload_path_in_repo}")
152
  break
153
-
154
  except Exception as e:
155
  attempt += 1
156
  error_message = str(e)
157
-
158
- # ================================
159
- # Added detection of 429 rate-limit responses
160
- # ================================
161
- # "You have been rate-limited; you can retry this action in 31 minutes."
162
- # Extract the wait time from messages like the one above, wait that many minutes plus one, then retry
163
  if "rate-limited" in error_message and "minutes" in error_message:
164
  import re
165
  match = re.search(r"in (\d+) minutes?", error_message)
166
  if match:
167
- minutes = int(match.group(1))
168
- # Add one minute and wait
169
- minutes += 1
170
- logger.warning(f"Rate-limited. Waiting {minutes} minutes before retry...")
171
  time.sleep(minutes * 60)
172
- attempt -= 1  # retry with the same attempt count
173
  continue
174
-
175
- # ================================
176
- # Existing 1-hour wait handling
177
- # ================================
178
  if "you can retry this action in about 1 hour" in error_message:
179
- logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
180
  time.sleep(3600)
181
- attempt -= 1  # continue the loop without consuming a retry
182
  continue
183
-
184
  if "over the limit of 100000 files" in error_message:
185
  logger.warning("Repository file limit exceeded. Creating a new repository...")
186
  self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
@@ -189,7 +164,6 @@ class CivitAICrawler:
189
  repo_id = self.repo_ids['current']
190
  continue
191
 
192
- # Any other error
193
  if attempt < max_retries:
194
  logger.warning(
195
  f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
@@ -200,148 +174,6 @@ class CivitAICrawler:
200
  )
201
  raise
202
 
203
- @staticmethod
204
- def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
205
- if content_disposition:
206
- parts = content_disposition.split(';')
207
- for part in parts:
208
- if "filename=" in part:
209
- return part.split("=")[1].strip().strip('"')
210
- return default_name
211
-
212
- def download_file(self, url: str, destination_folder: str, default_name: str):
213
- try:
214
- response = requests.get(url, headers=self.config.HEADERS, stream=True)
215
- response.raise_for_status()
216
- except requests.RequestException as e:
217
- logger.error(f"Failed to download file from {url}: {e}")
218
- return
219
-
220
- filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
221
- file_path = os.path.join(destination_folder, filename)
222
-
223
- with open(file_path, 'wb') as file:
224
- for chunk in response.iter_content(chunk_size=8192):
225
- file.write(chunk)
226
- logger.info(f"Download completed: {file_path}")
227
-
228
- def get_model_info(self, model_id: str) -> dict:
229
- try:
230
- response = requests.get(self.config.URLS["modelId"] + str(model_id), headers=self.config.HEADERS)
231
- response.raise_for_status()
232
- return response.json()
233
- except requests.RequestException as e:
234
- logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
235
-
236
- def download_model(self, model_versions: list, folder: str, existing_old_version_files: list = []):
237
- latest_version = model_versions[0]
238
- latest_files = latest_version["files"]
239
- for file_info in latest_files:
240
- download_url = file_info["downloadUrl"]
241
- file_name = file_info["name"]
242
- login_detected_count = 0
243
-
244
- while login_detected_count < 5:
245
- try:
246
- self.download_file(download_url, folder, file_name)
247
- except Exception as e:
248
- logger.error(f"Exception occurred while downloading {file_name}: {e}")
249
- login_detected_count += 1
250
- continue
251
-
252
- if "login" in os.listdir(folder):
253
- login_detected_count += 1
254
- logger.warning(f"'login' file found. Will try again. ({login_detected_count}/5)")
255
- os.remove(os.path.join(folder, "login"))
256
- else:
257
- logger.info(f"Successfully downloaded {file_name}")
258
- break
259
-
260
- if login_detected_count >= 5:
261
- dummy_file_name = f"{file_name}.download_failed"
262
- dummy_file_path = os.path.join(folder, dummy_file_name)
263
- try:
264
- with open(dummy_file_path, "w") as f:
265
- f.write("Download failed after 5 attempts.")
266
- logger.error(f"Failed to download {file_name}. Created dummy file {dummy_file_name}. URL: {download_url}")
267
- except Exception as e:
268
- logger.error(f"Failed to create dummy file for {file_name}: {e}")
269
-
270
- # Download old versions
271
- if len(model_versions) > 1:
272
- old_versions_folder = os.path.join(folder, "old_versions")
273
- os.makedirs(old_versions_folder, exist_ok=True)
274
- for version in model_versions[1:]:
275
- for file_info in version["files"]:
276
- file_name = file_info["name"]
277
- if file_name in existing_old_version_files:
278
- logger.info(f"Skipping download of existing old version file: {file_name}")
279
- continue
280
- download_url = file_info["downloadUrl"]
281
- local_file_path = os.path.join(old_versions_folder, file_name)
282
- login_detected_count = 0
283
-
284
- while login_detected_count < 5:
285
- try:
286
- self.download_file(download_url, old_versions_folder, file_name)
287
- except Exception as e:
288
- logger.error(f"Exception occurred while downloading {file_name}: {e}")
289
- login_detected_count += 1
290
- continue
291
-
292
- if "login" in os.listdir(old_versions_folder):
293
- login_detected_count += 1
294
- logger.warning(f"'login' file found while downloading {file_name}. Will try again. ({login_detected_count}/5)")
295
- os.remove(os.path.join(old_versions_folder, "login"))
296
- else:
297
- logger.info(f"Successfully downloaded {file_name}")
298
- break
299
-
300
- if login_detected_count >= 5:
301
- dummy_file_name = f"{file_name}.download_failed"
302
- dummy_file_path = os.path.join(old_versions_folder, dummy_file_name)
303
- try:
304
- with open(dummy_file_path, "w") as f:
305
- f.write("Download failed after 5 attempts.")
306
- logger.error(f"Failed to download {file_name}. Created dummy file {dummy_file_name}. URL: {download_url}")
307
- except Exception as e:
308
- logger.error(f"Failed to create dummy file for {file_name}: {e}")
309
-
310
- def download_images(self, model_versions: list, folder: str):
311
- images_folder = os.path.join(folder, "images")
312
- os.makedirs(images_folder, exist_ok=True)
313
-
314
- images = []
315
- for version in model_versions:
316
- for img in version.get("images", []):
317
- image_url = img["url"]
318
- images.append(image_url)
319
-
320
- for image_url in images:
321
- image_name = image_url.split("/")[-1]
322
- try:
323
- response = requests.get(image_url)
324
- response.raise_for_status()
325
- with open(os.path.join(images_folder, f"{image_name}.png"), "wb") as file:
326
- file.write(response.content)
327
- except requests.RequestException as e:
328
- logger.error(f"Error downloading image {image_url}: {e}")
329
-
330
- def save_html_content(self, url: str, folder: str):
331
- try:
332
- response = requests.get(url)
333
- response.raise_for_status()
334
- html_path = os.path.join(folder, f"{folder}.html")
335
- with open(html_path, 'w', encoding='utf-8') as file:
336
- file.write(response.text)
337
- except Exception as e:
338
- logger.error(f"Error saving HTML content for URL {url}: {e}")
339
-
340
- @staticmethod
341
- def save_model_info(model_info: dict, folder: str):
342
- with open(os.path.join(folder, "model_info.json"), "w") as file:
343
- json.dump(model_info, file, indent=2)
344
-
345
  @staticmethod
346
  def increment_repo_name(repo_id: str) -> str:
347
  match = re.search(r'(\d+)$', repo_id)
@@ -351,15 +183,38 @@ class CivitAICrawler:
351
  else:
352
  return f"{repo_id}1"
353
 
354
- # =============================================================================
355
- # Unencrypted uploads (for logs and model_list.log)
356
- # =============================================================================
357
- def upload_file_raw(
358
  self,
359
- file_path: str,
360
  repo_id: Optional[str] = None,
361
- path_in_repo: Optional[str] = None
362
  ):
 
 
 
 
 
 
363
  if repo_id is None:
364
  repo_id = self.repo_ids['current']
365
  if path_in_repo is None:
@@ -387,7 +242,7 @@ class CivitAICrawler:
387
  repo_id = self.repo_ids['current']
388
  continue
389
  elif "you can retry this action in about 1 hour" in error_message:
390
- logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
391
  time.sleep(3600)
392
  attempt -= 1
393
  else:
@@ -397,67 +252,150 @@ class CivitAICrawler:
397
  logger.error(f"Failed to upload raw file after {max_retries} attempts: {file_path}")
398
  raise
399
 
400
- # =============================================================================
401
- # Encrypt and upload (single file)
402
- # =============================================================================
403
- def upload_file_encrypted(
404
- self,
405
- file_path: str,
406
- repo_id: Optional[str] = None,
407
- path_in_repo: Optional[str] = None
408
- ):
409
- if repo_id is None:
410
- repo_id = self.repo_ids['current']
411
- base_path = path_in_repo or ""
412
 
413
- self.encrypt_with_rclone(file_path)
414
- self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=base_path)
 
 
 
 
 
415
 
416
- if os.path.isdir(self.config.ENCRYPTED_DIR):
417
- shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
418
 
419
- # =============================================================================
420
- # Encrypt and upload (folder)
421
- # =============================================================================
422
- def upload_folder_encrypted(
423
- self,
424
- folder_path: str,
425
- repo_id: Optional[str] = None,
426
- path_in_repo: Optional[str] = None
427
- ) -> str:
428
- if repo_id is None:
429
- repo_id = self.repo_ids['current']
430
- base_path = path_in_repo or ""
431
 
432
- self.encrypt_with_rclone(folder_path)
 
433
 
434
- top_levels = [
435
- d for d in os.listdir(self.config.ENCRYPTED_DIR)
436
- if os.path.isdir(os.path.join(self.config.ENCRYPTED_DIR, d))
437
- ]
438
- if not top_levels:
439
- raise RuntimeError("No top-level folder found after rclone encryption.")
440
- if len(top_levels) > 1:
441
- logger.warning(f"Multiple top-level folders found after encryption? {top_levels}. Using the first one.")
 
 
 
 
442
 
443
- encrypted_top_name = top_levels[0]
 
 
 
 
444
 
445
- self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=base_path)
 
 
 
446
 
447
- if os.path.isdir(self.config.ENCRYPTED_DIR):
448
- shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
 
 
 
 
 
 
 
449
 
450
- return encrypted_top_name
 
 
451
 
452
- # =============================================================================
453
- # Changed model_list.log reads/writes to use the "model_id: model_hf_url" format
454
- # =============================================================================
455
- def read_model_list(self):
 
 
 
 
 
 
456
  """
457
- Read each line of model_list.log in the format
458
- "123456: https://huggingface.co/...encrypted_folder_name"
459
- and return a dict of the form { "123456": "https://huggingface.co/..." }
460
  """
 
 
 
 
 
 
461
  model_list = {}
462
  try:
463
  with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
@@ -469,95 +407,103 @@ class CivitAICrawler:
469
  if len(parts) == 2:
470
  stored_id, stored_url = parts
471
  model_list[stored_id] = stored_url
472
- return model_list
473
  except Exception as e:
474
  logger.error(f"Failed to read model list: {e}")
475
- return {}
476
 
477
  def process_model(self, model_url: str):
478
- """Process the given model URL."""
479
  try:
480
  model_id = model_url.rstrip("/").split("/")[-1]
481
  model_info = self.get_model_info(model_id)
482
-
 
 
 
483
  latest_version = model_info.get("modelVersions", [])[0]
484
  model_file = next(
485
- (file for file in latest_version["files"] if file.get('type') == 'Model'),
486
  None
487
  )
488
  if model_file:
489
- latest_filename = model_file['name']
490
  folder = os.path.splitext(latest_filename)[0]
491
  else:
 
492
  first_file = latest_version["files"][0]
493
- latest_filename = first_file['name']
494
  folder = os.path.splitext(latest_filename)[0]
495
- logger.warning(f"No 'Model' type file found for model ID {model_id}. Using first file's name.")
496
-
 
497
  os.makedirs(folder, exist_ok=True)
498
-
499
- # Load model_list
500
  model_list = self.read_model_list()
501
-
502
- # Example of checking whether a model with the same name (model page name) has already been uploaded:
503
- # Note: decide whether to key on modelpage_name (= model_info["name"])
504
- # or on model_id (str), depending on how you run this.
505
- # As an example, the flow below checks using modelpage_name as the key:
506
- modelpage_name = model_info.get("name", "Unnamed Model")
507
-
508
  if modelpage_name in model_list.values():
509
- # The same model page name has already been uploaded; decide here whether to skip, overwrite, etc.
510
- logger.info(f"Model '{modelpage_name}' is already listed in model_list. Skipping re-upload.")
511
- # If you do not want to force a re-upload, end processing with return:
512
- # return
513
- # Alternatively you could force the upload but only add new versions, and so on.
514
- # Here we deliberately continue; adjust as needed.
515
-
516
- # Download files and save images
517
- existing_old_version_files = []
518
- self.download_model(model_info["modelVersions"], folder, existing_old_version_files)
519
- self.download_images(model_info["modelVersions"], folder)
520
- self.save_html_content(model_url, folder)
521
- self.save_model_info(model_info, folder)
522
-
523
- # ========== Upload the encrypted folder with rclone ==========
524
- encrypted_top_name = self.upload_folder_encrypted(folder)
525
-
526
- # URL of the model uploaded this time
527
- model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{encrypted_top_name}"
528
-
529
- # Append to model_list.log in the "modelpage_name: model_hf_url" format
530
- with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
531
- f.write(f"{modelpage_name}: {model_hf_url}\n")
532
-
533
- # Delete the local folder
 
 
534
  if os.path.exists(folder):
535
  shutil.rmtree(folder)
536
-
 
 
 
 
 
 
 
 
537
  except Exception as e:
538
  logger.error(f"Unexpected error processing model ({model_url}): {e}")
539
 
540
 
541
  async def crawl(self):
542
- """Periodically check for and process new models."""
543
  while True:
544
  try:
545
  login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
546
 
547
- # Fetch model_list.log & civitai_backup.log
548
  model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
549
  shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
550
 
551
  local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
552
  shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
553
 
554
- # Read the log
555
  with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
556
  lines = file.read().splitlines()
557
  old_models = json.loads(lines[0]) if len(lines) > 0 else []
558
  self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
559
 
560
- # Check for new models
561
  response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
562
  response.raise_for_status()
563
  latest_models = response.json().get("items", [])
@@ -570,6 +516,7 @@ class CivitAICrawler:
570
  logger.info(f"New models found: {new_models}")
571
  model_id = new_models[0]
572
 
 
573
  for attempt in range(1, 6):
574
  try:
575
  self.process_model(f"{self.config.URLS['modelId']}{model_id}")
@@ -581,43 +528,29 @@ class CivitAICrawler:
581
  else:
582
  await asyncio.sleep(2)
583
  else:
584
- # No new models
585
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
586
  f.write(json.dumps(latest_model_ids) + "\n")
587
  f.write(f"{self.repo_ids['current']}\n")
588
  logger.info(f"Updated log file: {self.config.LOG_FILE}")
589
 
590
- self.upload_file_raw(
591
- file_path=self.config.LOG_FILE,
592
- repo_id=self.repo_ids["log"],
593
- path_in_repo=self.config.LOG_FILE
594
- )
595
  logger.info("Uploaded log file to repository (unencrypted).")
596
 
597
  logger.info("No new models found.")
598
  await asyncio.sleep(60)
599
  continue
600
 
601
- # Add the processed model ID to old_models
602
  old_models.append(model_id)
603
-
604
- # Update the log file
605
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
606
  f.write(json.dumps(old_models) + "\n")
607
  f.write(f"{self.repo_ids['current']}\n")
608
  logger.info(f"Updated log file with new model ID: {model_id}")
609
 
610
- # Upload the log and model_list.log
611
- self.upload_file_raw(
612
- file_path=self.config.LOG_FILE,
613
- repo_id=self.repo_ids["log"],
614
- path_in_repo=self.config.LOG_FILE
615
- )
616
- self.upload_file_raw(
617
- file_path=self.config.LIST_FILE,
618
- repo_id=self.repo_ids["model_list"],
619
- path_in_repo=self.config.LIST_FILE
620
- )
621
 
622
  except Exception as e:
623
  logger.error(f"Error during crawling: {e}")
 
16
  from fastapi import FastAPI
17
  from huggingface_hub import HfApi, create_repo, hf_hub_download, login
18
 
 
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
 
48
 
49
  # ===== Additional settings for rclone =====
50
  RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
 
51
  ENCRYPTED_DIR = "/home/user/app/encrypted"
52
 
53
 
 
61
  self.repo_ids = self.config.REPO_IDS.copy()
62
  self.jst = self.config.JST
63
 
64
+ # Set up rclone
65
  self.setup_rclone_conf()
66
 
67
  self.setup_routes()
68
 
69
  def setup_routes(self):
 
70
  @self.app.get("/")
71
  def read_root():
72
  now = str(datetime.datetime.now(self.jst))
73
+ description = (
74
+ f"This Space periodically crawls CivitAI and backs up new models to {self.repo_ids['current']}.\n"
75
+ f"model_list.log and civitai_backup.log are uploaded without encryption.\n"
76
+ f"Model folders and files are encrypted before upload.\n"
77
+ f"Status: {now} + currently running :D\n"
78
+ )
79
  return description
80
 
81
  @self.app.on_event("startup")
82
  async def startup_event():
83
  asyncio.create_task(self.crawl())
84
 
85
+ # ============================================================================
86
+ # rclone configuration & encrypted upload handling
87
+ # ============================================================================
88
  def setup_rclone_conf(self):
 
89
  if not self.config.RCLONE_CONF_BASE64:
90
  logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
91
  return
 
92
  os.makedirs(".rclone_config", exist_ok=True)
93
  conf_path = os.path.join(".rclone_config", "rclone.conf")
94
  with open(conf_path, "wb") as f:
95
  f.write(base64.b64decode(self.config.RCLONE_CONF_BASE64))
 
96
  os.environ["RCLONE_CONFIG"] = conf_path
97
  logger.info(f"[INFO] rclone.conf created at: {conf_path}")
98
 
99
  def encrypt_with_rclone(self, local_path: str):
100
+ """Copy a single file or directory to cryptLocal: and encrypt it"""
 
 
 
101
  if not os.path.exists(local_path):
102
  raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")
 
103
  # Clean up the encryption output directory beforehand
104
  if os.path.isdir(self.config.ENCRYPTED_DIR):
105
  shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
106
 
107
+ top_level_name = os.path.basename(local_path.rstrip("/")) or "unnamed"
 
 
 
108
  cmd = ["rclone", "copy", local_path, f"cryptLocal:{top_level_name}", "-v"]
109
  logger.info(f"[INFO] Running: {' '.join(cmd)}")
110
  subprocess.run(cmd, check=True)
 
115
  f"[ERROR] {self.config.ENCRYPTED_DIR} not found. Check your rclone config."
116
  )
117
 
 
118
  def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
119
+ """Recursively upload the encrypted files under self.config.ENCRYPTED_DIR"""
120
  max_retries = 5
121
  for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
122
  for fn in files:
123
  encrypted_file_path = os.path.join(root, fn)
124
  if not os.path.isfile(encrypted_file_path):
125
  continue
 
126
  relative_path = os.path.relpath(encrypted_file_path, self.config.ENCRYPTED_DIR)
127
  upload_path_in_repo = os.path.join(base_path_in_repo, relative_path)
128
 
 
136
  )
137
  logger.info(f"[OK] Uploaded => {repo_id}/{upload_path_in_repo}")
138
  break
 
139
  except Exception as e:
140
  attempt += 1
141
  error_message = str(e)
142
+ # 429 Rate-limit (31 minutes)
 
 
 
 
 
143
  if "rate-limited" in error_message and "minutes" in error_message:
144
  import re
145
  match = re.search(r"in (\d+) minutes?", error_message)
146
  if match:
147
+ minutes = int(match.group(1)) + 1
148
+ logger.warning(f"Rate-limited. Waiting {minutes} minutes...")
 
 
149
  time.sleep(minutes * 60)
150
+ attempt -= 1
151
  continue
152
+ # 1-hour wait pattern
 
 
 
153
  if "you can retry this action in about 1 hour" in error_message:
154
+ logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour...")
155
  time.sleep(3600)
156
+ attempt -= 1
157
  continue
158
+ # 100k-file limit
159
  if "over the limit of 100000 files" in error_message:
160
  logger.warning("Repository file limit exceeded. Creating a new repository...")
161
  self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
 
164
  repo_id = self.repo_ids['current']
165
  continue
166
 
 
167
  if attempt < max_retries:
168
  logger.warning(
169
  f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
 
174
  )
175
  raise
176
 
 
 
 
 
 
 
177
  @staticmethod
178
  def increment_repo_name(repo_id: str) -> str:
179
  match = re.search(r'(\d+)$', repo_id)
 
183
  else:
184
  return f"{repo_id}1"
185
 
186
+ # ============================================================================
187
+ # Encrypted upload of a single file, then local deletion
188
+ # ============================================================================
189
+ def upload_file_encrypted_one_by_one(
190
  self,
191
+ local_path: str,
192
  repo_id: Optional[str] = None,
193
+ path_in_repo: str = ""
194
  ):
195
+ """
196
+ Encrypt and upload a single file (or folder), then delete the local copy.
197
+ """
198
+ if not repo_id:
199
+ repo_id = self.repo_ids['current']
200
+
201
+ self.encrypt_with_rclone(local_path)
202
+ self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=path_in_repo)
203
+
204
+ # Remove the ENCRYPTED_DIR used for encryption
205
+ if os.path.isdir(self.config.ENCRYPTED_DIR):
206
+ shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
207
+
208
+ # Also delete the original local file
209
+ if os.path.isfile(local_path):
210
+ os.remove(local_path)
211
+ elif os.path.isdir(local_path):
212
+ shutil.rmtree(local_path, ignore_errors=True)
213
+
214
+ # ============================================================================
215
+ # Raw (unencrypted) file upload (logs, etc.)
216
+ # ============================================================================
217
+ def upload_file_raw(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
218
  if repo_id is None:
219
  repo_id = self.repo_ids['current']
220
  if path_in_repo is None:
 
242
  repo_id = self.repo_ids['current']
243
  continue
244
  elif "you can retry this action in about 1 hour" in error_message:
245
+ logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour...")
246
  time.sleep(3600)
247
  attempt -= 1
248
  else:
 
252
  logger.error(f"Failed to upload raw file after {max_retries} attempts: {file_path}")
253
  raise
254
 
255
+ # ============================================================================
256
+ # Download helpers
257
+ # ============================================================================
258
+ @staticmethod
259
+ def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
260
+ if content_disposition:
261
+ parts = content_disposition.split(';')
262
+ for part in parts:
263
+ if "filename=" in part:
264
+ return part.split("=")[1].strip().strip('"')
265
+ return default_name
 
266
 
267
+ def download_file(self, url: str, destination_folder: str, default_name: str):
268
+ try:
269
+ response = requests.get(url, headers=self.config.HEADERS, stream=True)
270
+ response.raise_for_status()
271
+ except requests.RequestException as e:
272
+ logger.error(f"Failed to download file from {url}: {e}")
273
+ return None
274
 
275
+ filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
276
+ file_path = os.path.join(destination_folder, filename)
277
 
278
+ with open(file_path, 'wb') as file:
279
+ for chunk in response.iter_content(chunk_size=8192):
280
+ file.write(chunk)
 
 
 
 
281
 
282
+ logger.info(f"Download completed: {file_path}")
283
+ return file_path  # ★ now returns the path of the downloaded file
284
 
285
+ # ============================================================================
286
+ # (★Changed) Download → encrypt → upload → delete, one file at a time
287
+ # ============================================================================
288
+ def process_latest_files_one_by_one(self, version_data: dict, model_folder: str, encrypted_folder_name: str):
289
+ """
290
+ Download the latest version's files one at a time → encrypted upload → delete locally.
291
+ path_in_repo is based on "{encrypted_folder_name}/".
292
+ """
293
+ files = version_data.get("files", [])
294
+ for file_info in files:
295
+ download_url = file_info["downloadUrl"]
296
+ file_name = file_info["name"]
297
 
298
+ # Download
299
+ local_path = self.download_file(download_url, model_folder, file_name)
300
+ if not local_path or not os.path.exists(local_path):
301
+ logger.warning(f"Skip because file not found locally: {local_path}")
302
+ continue
303
 
304
+ # Encrypted upload
305
+ # e.g. "myModelName/filename"
306
+ in_repo_path = os.path.join(encrypted_folder_name, file_name)
307
+ self.upload_file_encrypted_one_by_one(local_path, repo_id=self.repo_ids['current'], path_in_repo=in_repo_path)
308
 
309
+ def process_images_one_by_one(self, version_list: list, model_folder: str, encrypted_folder_name: str):
310
+ """
311
+ Download all images one at a time → encrypted upload → delete.
312
+ path_in_repo is "{encrypted_folder_name}/images/".
313
+ """
314
+ images = []
315
+ for version in version_list:
316
+ for img_info in version.get("images", []):
317
+ images.append(img_info["url"])
318
 
319
+ for image_url in images:
320
+ image_name = image_url.split("/")[-1] + ".png"
321
+ local_path = os.path.join(model_folder, image_name)
322
 
323
+ # Download
324
+ try:
325
+ resp = requests.get(image_url, stream=True)
326
+ resp.raise_for_status()
327
+ with open(local_path, "wb") as f:
328
+ for chunk in resp.iter_content(chunk_size=8192):
329
+ f.write(chunk)
330
+ logger.info(f"Downloaded image: {local_path}")
331
+ except Exception as e:
332
+ logger.error(f"Error downloading image {image_url}: {e}")
333
+ continue
334
+
335
+ # Upload
336
+ in_repo_path = os.path.join(encrypted_folder_name, "images", image_name)
337
+ self.upload_file_encrypted_one_by_one(local_path, self.repo_ids['current'], in_repo_path)
338
+
339
+ def process_old_versions_one_by_one(self, version_list: list, model_folder: str, encrypted_folder_name: str):
340
  """
341
+ Download the files of old versions (index >= 1) one at a time → encrypted upload → delete.
342
+ path_in_repo is "{encrypted_folder_name}/old_versions/{versionID_orName}/filename".
 
343
  """
344
+ if len(version_list) <= 1:
345
+ return
346
+
347
+ for old_version in version_list[1:]:
348
+ # どんな名前でフォルダを区別するか(バージョンIDやバージョン名など)
349
+ version_id_or_name = str(old_version.get("id", "old_ver"))
350
+ files = old_version.get("files", [])
351
+ for file_info in files:
352
+ download_url = file_info["downloadUrl"]
353
+ file_name = file_info["name"]
354
+
355
+ # ダウンロード
356
+ local_path = self.download_file(download_url, model_folder, file_name)
357
+ if not local_path or not os.path.exists(local_path):
358
+ logger.warning(f"Skip because file not found locally: {local_path}")
359
+ continue
360
+
361
+ # 暗号化アップロード
362
+ in_repo_path = os.path.join(
363
+ encrypted_folder_name,
364
+ "old_versions",
365
+ version_id_or_name,
366
+ file_name
367
+ )
368
+ self.upload_file_encrypted_one_by_one(local_path, self.repo_ids['current'], in_repo_path)
369
+
370
+ # ============================================================================
371
+ # HTML & model_info.json は軽量なので一括DL→アップロードでもOK
372
+ # ============================================================================
373
+ def save_html_content(self, url: str, folder: str):
374
+ try:
375
+ response = requests.get(url)
376
+ response.raise_for_status()
377
+ html_path = os.path.join(folder, os.path.basename(folder) + ".html")
378
+ with open(html_path, 'w', encoding='utf-8') as file:
379
+ file.write(response.text)
380
+ return html_path
381
+ except Exception as e:
382
+ logger.error(f"Error saving HTML content for URL {url}: {e}")
383
+ return None
384
+
385
+ def save_model_info(self, model_info: dict, folder: str):
386
+ json_path = os.path.join(folder, "model_info.json")
387
+ try:
388
+ with open(json_path, "w", encoding="utf-8") as file:
389
+ json.dump(model_info, file, indent=2)
390
+ return json_path
391
+ except Exception as e:
392
+ logger.error(f"Error saving model_info.json: {e}")
393
+ return None
394
+
395
+ # ============================================================================
396
+ # model_list.log
397
+ # ============================================================================
398
+ def read_model_list(self):
399
  model_list = {}
400
  try:
401
  with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
 
407
  if len(parts) == 2:
408
  stored_id, stored_url = parts
409
  model_list[stored_id] = stored_url
 
410
  except Exception as e:
411
  logger.error(f"Failed to read model list: {e}")
412
+ return model_list
413
 
414
  def process_model(self, model_url: str):
415
+ """Process the given model URL (download → upload → delete files one at a time)"""
416
  try:
417
  model_id = model_url.rstrip("/").split("/")[-1]
418
  model_info = self.get_model_info(model_id)
419
+ if not model_info:
420
+ logger.error(f"No model_info returned for {model_id}")
421
+ return
422
+
423
  latest_version = model_info.get("modelVersions", [])[0]
424
  model_file = next(
425
+ (file for file in latest_version.get("files", []) if file.get("type") == "Model"),
426
  None
427
  )
428
  if model_file:
429
+ latest_filename = model_file["name"]
430
  folder = os.path.splitext(latest_filename)[0]
431
  else:
432
+ # If there is no file with type=Model, fall back to the first file's name for the folder name
433
  first_file = latest_version["files"][0]
434
+ latest_filename = first_file["name"]
435
  folder = os.path.splitext(latest_filename)[0]
436
+ logger.warning(f"No 'Model' type file found for {model_id}. Using first file's name.")
437
+
438
+ # Create a temporary local folder
439
  os.makedirs(folder, exist_ok=True)
440
+
441
+ # Duplicate check (read model_list.log and skip if modelpage_name is already present)
442
  model_list = self.read_model_list()
443
+ modelpage_name = model_info.get("name", f"Model_{model_id}")
 
 
 
 
 
 
444
  if modelpage_name in model_list.values():
445
+ logger.info(f"Model '{modelpage_name}' already in model_list. Skipping.")
446
+ # return  # return here if desired
447
+
448
+ # HTML and model_info are small, so save them locally first and then encrypt/upload them as single files
449
+ html_path = self.save_html_content(self.config.URLS["modelPage"] + str(model_id), folder)
450
+ json_path = self.save_model_info(model_info, folder)
451
+
452
+ # Encrypted upload (HTML, JSON, etc.)
453
+ # Example: stored as "{folder}/model_info.json" on HF
454
+ if html_path and os.path.exists(html_path):
455
+ in_repo_path = os.path.join(folder, os.path.basename(html_path))
456
+ self.upload_file_encrypted_one_by_one(html_path, self.repo_ids['current'], in_repo_path)
457
+
458
+ if json_path and os.path.exists(json_path):
459
+ in_repo_path = os.path.join(folder, "model_info.json")
460
+ self.upload_file_encrypted_one_by_one(json_path, self.repo_ids['current'], in_repo_path)
461
+
462
+ # Upload the latest version one file at a time
463
+ self.process_latest_files_one_by_one(latest_version, folder, folder)
464
+
465
+ # Upload images one file at a time
466
+ self.process_images_one_by_one(model_info["modelVersions"], folder, folder)
467
+
468
+ # Upload old versions one file at a time
469
+ self.process_old_versions_one_by_one(model_info["modelVersions"], folder, folder)
470
+
471
+ # The folder is mostly empty by now, but delete it just in case
472
  if os.path.exists(folder):
473
  shutil.rmtree(folder)
474
+
475
+ # Finally, append to model_list.log ("modelpage_name: HF URL")
476
+ # Files are now encrypted individually rather than per folder, so use "folder" as the root name for now
477
+ # Top-level folder URL on Hugging Face:
478
+ # https://huggingface.co/REPO_ID/tree/main/folder
479
+ model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{folder}"
480
+ with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
481
+ f.write(f"{modelpage_name}: {model_hf_url}\n")
482
+
483
  except Exception as e:
484
  logger.error(f"Unexpected error processing model ({model_url}): {e}")
485
 
486
 
487
  async def crawl(self):
488
+ """Loop that checks for new models and processes them one at a time"""
489
  while True:
490
  try:
491
  login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
492
 
493
+ # Download the latest model_list.log & civitai_backup.log
494
  model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
495
  shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
496
 
497
  local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
498
  shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
499
 
500
+ # Read civitai_backup.log
501
  with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
502
  lines = file.read().splitlines()
503
  old_models = json.loads(lines[0]) if len(lines) > 0 else []
504
  self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
505
 
506
+ # Check for new models
507
  response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
508
  response.raise_for_status()
509
  latest_models = response.json().get("items", [])
 
516
  logger.info(f"New models found: {new_models}")
517
  model_id = new_models[0]
518
 
519
+ # Up to 5 attempts
520
  for attempt in range(1, 6):
521
  try:
522
  self.process_model(f"{self.config.URLS['modelId']}{model_id}")
 
528
  else:
529
  await asyncio.sleep(2)
530
  else:
531
+ # No new models → update & upload backup.log
532
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
533
  f.write(json.dumps(latest_model_ids) + "\n")
534
  f.write(f"{self.repo_ids['current']}\n")
535
  logger.info(f"Updated log file: {self.config.LOG_FILE}")
536
 
537
+ self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
 
 
 
 
538
  logger.info("Uploaded log file to repository (unencrypted).")
539
 
540
  logger.info("No new models found.")
541
  await asyncio.sleep(60)
542
  continue
543
 
544
+ # Add the successfully processed model to old_models → update backup.log
545
  old_models.append(model_id)
 
 
546
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
547
  f.write(json.dumps(old_models) + "\n")
548
  f.write(f"{self.repo_ids['current']}\n")
549
  logger.info(f"Updated log file with new model ID: {model_id}")
550
 
551
+ # Upload the log & model_list.log
552
+ self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
553
+ self.upload_file_raw(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)
 
 
 
 
554
 
555
  except Exception as e:
556
  logger.error(f"Error during crawling: {e}")
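Note on the rclone setup this commit relies on: setup_rclone_conf() base64-decodes the RCLONE_CONF_BASE64 secret into .rclone_config/rclone.conf and points RCLONE_CONFIG at it, and encrypt_with_rclone() then runs "rclone copy <local> cryptLocal:<name> -v", expecting the encrypted output to appear under /home/user/app/encrypted. The sketch below is one way such a secret might be produced; the exact rclone.conf contents (the crypt remote's target, filename_encryption, passwords) are assumptions, since the commit only shows that a remote named cryptLocal: must exist and where its encrypted output lands.

    # make_rclone_secret.py -- hypothetical helper, not part of this commit.
    # Assumed rclone.conf layout (values illustrative, passwords elided):
    #
    #   [cryptLocal]
    #   type = crypt
    #   remote = /home/user/app/encrypted   # should match Config.ENCRYPTED_DIR
    #   filename_encryption = standard      # setting mentioned by the old encrypt_with_rclone docstring
    #   password = ...
    #
    # The Space secret is simply the base64 of that file, mirroring the
    # base64.b64decode() call in setup_rclone_conf().
    import base64

    with open("rclone.conf", "rb") as f:            # your local rclone.conf
        secret = base64.b64encode(f.read()).decode()

    print(secret)  # paste this value into the RCLONE_CONF_BASE64 Space secret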