ttttdiva committed on
Commit
88c1cd8
·
verified ·
1 Parent(s): f30744e

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +214 -276
main.py CHANGED
@@ -14,14 +14,14 @@ import requests
14
  from bs4 import BeautifulSoup
15
  from fake_useragent import UserAgent
16
  from fastapi import FastAPI
17
- from huggingface_hub import HfApi, create_repo, hf_hub_download, login
18
 
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
22
 
23
  class Config:
24
- """設定用のクラス"""
25
  HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
26
  CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
27
  LOG_FILE = "civitai_backup.log"
@@ -35,24 +35,22 @@ class Config:
35
  "latest": "https://civitai.com/api/v1/models?sort=Newest",
36
  "modelPage": "https://civitai.com/models/",
37
  "modelId": "https://civitai.com/api/v1/models/",
38
- "modelVersionId": "https://civitai.com/api/v1/model-versions/",
39
- "hash": "https://civitai.com/api/v1/model-versions/by-hash/"
40
  }
41
  JST = datetime.timezone(datetime.timedelta(hours=9))
42
  UA = UserAgent()
43
  HEADERS = {
44
- 'Authorization': f'Bearer {CIVITAI_API_TOKEN}',
45
- 'User-Agent': 'civitai-crawler/1.0',
46
  "Content-Type": "application/json"
47
  }
48
 
49
- # rclone 用の追加設定
50
  RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
51
  ENCRYPTED_DIR = "/home/user/app/encrypted"
52
 
53
 
54
  class CivitAICrawler:
55
- """CivitAIからモデルをダウンロードし、Hugging Faceにアップロードするクラス"""
56
 
57
  def __init__(self, config: Config):
58
  self.config = config
@@ -61,165 +59,138 @@ class CivitAICrawler:
61
  self.repo_ids = self.config.REPO_IDS.copy()
62
  self.jst = self.config.JST
63
 
64
- # rclone 設定の読み込み
65
  self.setup_rclone_conf()
66
  self.setup_routes()
67
 
68
  def setup_routes(self):
69
  @self.app.get("/")
70
- def read_root():
71
- now = str(datetime.datetime.now(self.jst))
72
- description = (
73
- f"CivitAIを定期的に周回し新規モデルを {self.repo_ids['current']} にバックアップするSpaceです。\n"
74
- f"model_list.log や civitai_backup.log は暗号化しないでアップロードします。\n"
75
- f"モデルのフォルダやファイルは暗号化してアップロードします。\n"
76
- f"Status: {now} + currently running :D\n"
77
- )
78
- return description
79
 
80
  @self.app.on_event("startup")
81
- async def startup_event():
82
  asyncio.create_task(self.crawl())
83
 
84
- # ============================================================================
85
- # rclone 設定 & 暗号化アップロード処理
86
- # ============================================================================
87
  def setup_rclone_conf(self):
88
  if not self.config.RCLONE_CONF_BASE64:
89
- logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
90
  return
91
  os.makedirs(".rclone_config", exist_ok=True)
92
  conf_path = os.path.join(".rclone_config", "rclone.conf")
93
  with open(conf_path, "wb") as f:
94
  f.write(base64.b64decode(self.config.RCLONE_CONF_BASE64))
95
  os.environ["RCLONE_CONFIG"] = conf_path
96
- logger.info(f"[INFO] rclone.conf created at: {conf_path}")
97
 
 
 
 
98
  def encrypt_with_rclone(self, local_path: str):
99
- """フォルダ or ファイルを cryptLocal: にコピーし、フォルダ名・ファイル名を暗号化"""
100
  if not os.path.exists(local_path):
101
- raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")
102
- # 事前に暗号先ディレクトリを掃除
103
  if os.path.isdir(self.config.ENCRYPTED_DIR):
104
  shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
105
 
106
- top_level_name = os.path.basename(local_path.rstrip("/")) or "unnamed"
107
- cmd = ["rclone", "copy", local_path, f"cryptLocal:{top_level_name}", "-v"]
108
- logger.info(f"[INFO] Running: {' '.join(cmd)}")
 
 
 
 
 
 
109
  subprocess.run(cmd, check=True)
110
- logger.info(f"[OK] rclone copy => cryptLocal:{top_level_name}")
111
 
112
  if not os.path.isdir(self.config.ENCRYPTED_DIR):
113
- raise FileNotFoundError(
114
- f"[ERROR] {self.config.ENCRYPTED_DIR} not found. Check your rclone config."
115
- )
116
 
117
- def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
118
- """self.config.ENCRYPTED_DIR 以下の暗号化済ファイルを再帰的にアップロード"""
119
  max_retries = 5
120
  for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
121
  for fn in files:
122
- encrypted_file_path = os.path.join(root, fn)
123
- if not os.path.isfile(encrypted_file_path):
124
- continue
125
-
126
- relative_path = os.path.relpath(encrypted_file_path, self.config.ENCRYPTED_DIR)
127
- upload_path_in_repo = os.path.join(base_path_in_repo, relative_path)
128
 
129
  attempt = 0
130
  while attempt < max_retries:
131
  try:
132
  self.api.upload_file(
133
- path_or_fileobj=encrypted_file_path,
134
  repo_id=repo_id,
135
- path_in_repo=upload_path_in_repo
136
  )
137
- logger.info(f"[OK] Uploaded => {repo_id}/{upload_path_in_repo}")
138
  break
139
  except Exception as e:
140
  attempt += 1
141
- error_message = str(e)
142
- # 429 Rate-limit with "in XX minutes"
143
- if "rate-limited" in error_message and "minutes" in error_message:
144
  import re
145
- match = re.search(r"in (\d+) minutes?", error_message)
146
- if match:
147
- minutes = int(match.group(1)) + 1
148
- logger.warning(f"Rate-limited. Waiting {minutes} minutes...")
149
- time.sleep(minutes * 60)
150
  attempt -= 1
151
  continue
152
- # 1時間待機
153
- if "you can retry this action in about 1 hour" in error_message:
154
- logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour...")
155
  time.sleep(3600)
156
  attempt -= 1
157
  continue
158
- # ファイル上限
159
- if "over the limit of 100000 files" in error_message:
160
- logger.warning("Repository file limit exceeded. Creating a new repository...")
161
  self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
162
  self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
163
  attempt = 0
164
  repo_id = self.repo_ids['current']
165
  continue
166
-
167
  if attempt < max_retries:
168
- logger.warning(
169
- f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
170
- )
171
  else:
172
- logger.error(f"Failed to upload after {max_retries} attempts: {encrypted_file_path}")
173
  raise
174
 
175
  def upload_folder_encrypted(self, folder_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
176
- """フォルダを丸ごと暗号化してアップロード (=フォルダ名も暗号化)"""
177
  if not repo_id:
178
  repo_id = self.repo_ids['current']
179
-
180
  self.encrypt_with_rclone(folder_path)
181
- self.upload_encrypted_files(repo_id, base_path_in_repo=path_in_repo)
182
-
183
- # 暗号化フォルダを削除
184
  if os.path.isdir(self.config.ENCRYPTED_DIR):
185
  shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
186
 
187
- # ============================================================================
188
- # 単一ファイルを暗号化アップロードしてローカル削除 (old_versions用)
189
- # ============================================================================
190
  def upload_file_encrypted_one_by_one(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
191
- """
192
- 単一ファイルを暗号化アップロードし、アップロード後にローカルファイルを削除。
193
- """
194
  if not repo_id:
195
  repo_id = self.repo_ids['current']
196
 
197
  self.encrypt_with_rclone(file_path)
198
- self.upload_encrypted_files(repo_id, base_path_in_repo=path_in_repo)
199
-
200
- # 暗号化ディレクトリを削除
201
  if os.path.isdir(self.config.ENCRYPTED_DIR):
202
  shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
203
- # ローカルの実ファイル削除
204
  if os.path.exists(file_path):
205
  os.remove(file_path)
206
 
207
  @staticmethod
208
  def increment_repo_name(repo_id: str) -> str:
209
- match = re.search(r'(\d+)$', repo_id)
210
- if match:
211
- number = int(match.group(1)) + 1
212
- return re.sub(r'\d+$', str(number), repo_id)
213
  else:
214
- return f"{repo_id}1"
215
 
216
  # ============================================================================
217
- # ログや model_list.log は生アップロード
218
  # ============================================================================
219
  def upload_file_raw(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
220
- if repo_id is None:
221
  repo_id = self.repo_ids['current']
222
- if path_in_repo is None:
223
  path_in_repo = os.path.basename(file_path)
224
 
225
  max_retries = 5
@@ -231,320 +202,287 @@ class CivitAICrawler:
231
  repo_id=repo_id,
232
  path_in_repo=path_in_repo
233
  )
234
- logger.info(f"[OK] Uploaded {file_path} => {repo_id}/{path_in_repo}")
235
  return
236
  except Exception as e:
237
  attempt += 1
238
- error_message = str(e)
239
- if "over the limit of 100000 files" in error_message:
240
- logger.warning("Repository file limit exceeded, creating a new repository.")
241
  self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
242
  self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
243
  attempt = 0
244
  repo_id = self.repo_ids['current']
245
  continue
246
- elif "you can retry this action in about 1 hour" in error_message:
247
- logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
248
  time.sleep(3600)
249
  attempt -= 1
250
  else:
251
  if attempt < max_retries:
252
- logger.warning(f"Failed to upload raw file {file_path}, retry {attempt}/{max_retries}...")
253
  else:
254
- logger.error(f"Failed to upload raw file after {max_retries} attempts: {file_path}")
255
  raise
256
 
257
  # ============================================================================
258
- # ダウンロード処理
259
  # ============================================================================
260
  @staticmethod
261
- def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
262
- if content_disposition:
263
- parts = content_disposition.split(';')
264
- for part in parts:
265
- if "filename=" in part:
266
- return part.split("=")[1].strip().strip('"')
267
  return default_name
268
 
269
- def download_file(self, url: str, destination_folder: str, default_name: str):
270
  try:
271
- response = requests.get(url, headers=self.config.HEADERS, stream=True)
272
- response.raise_for_status()
273
  except requests.RequestException as e:
274
- logger.error(f"Failed to download file from {url}: {e}")
275
  return None
276
 
277
- filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
278
- file_path = os.path.join(destination_folder, filename)
 
279
 
280
- with open(file_path, 'wb') as file:
281
- for chunk in response.iter_content(chunk_size=8192):
282
- file.write(chunk)
283
 
284
- logger.info(f"Download completed: {file_path}")
285
- return file_path
286
 
287
  # ============================================================================
288
- # 古いバージョンのみ1ファイルずつアップロード
289
  # ============================================================================
290
- def download_old_versions_one_by_one(self, version_list: list, folder: str):
291
- """version_list[1:] を対象に、モデルファイルを 1ファイルDL→upload→削除 を繰り返す"""
292
- if len(version_list) <= 1:
293
  return
294
-
295
- old_versions_folder = os.path.join(folder, "old_versions")
296
- os.makedirs(old_versions_folder, exist_ok=True)
297
-
298
- for version in version_list[1:]:
299
- for file_info in version.get("files", []):
300
- download_url = file_info["downloadUrl"]
301
- file_name = file_info["name"]
302
-
303
- local_path = self.download_file(download_url, old_versions_folder, file_name)
304
- if not local_path or not os.path.exists(local_path):
305
- logger.error(f"Failed to download or file not found: {file_name}")
306
- continue
307
-
308
- # 1つアップロードして削除
309
- # path_in_repo を空文字にすればフォルダ名も暗号化される(トップレベル)
310
- # もしサブフォルダにまとめたいなら "old_versions" とか指定する
311
- self.upload_file_encrypted_one_by_one(local_path, path_in_repo="")
312
-
313
- # old_versions フォルダ内は空になったはずなので削除
314
- if os.path.exists(old_versions_folder):
315
- shutil.rmtree(old_versions_folder, ignore_errors=True)
316
 
317
  # ============================================================================
318
- # 従来どおり「最新バージョンのファイル一式 + images」フォルダを一括DL→アップロード
319
  # ============================================================================
320
- def download_model(self, model_versions: list, folder: str):
321
- """最新バージョンを一括ダウンロード (フォルダにまとめる)"""
322
- latest_version = model_versions[0]
323
- latest_files = latest_version["files"]
324
-
325
- for file_info in latest_files:
326
- download_url = file_info["downloadUrl"]
327
- file_name = file_info["name"]
328
- local_path = self.download_file(download_url, folder, file_name)
329
- if local_path and os.path.exists(local_path):
330
- logger.info(f"Downloaded {file_name}")
331
- else:
332
- logger.warning(f"Could not download {file_name}")
333
 
334
  def download_images(self, model_versions: list, folder: str):
335
  images_folder = os.path.join(folder, "images")
336
  os.makedirs(images_folder, exist_ok=True)
337
 
338
- images = []
339
- for version in model_versions:
340
- for img in version.get("images", []):
341
- images.append(img["url"])
342
-
343
- for image_url in images:
344
- image_name = os.path.basename(image_url) + ".png"
345
- local_path = os.path.join(images_folder, image_name)
346
- try:
347
- resp = requests.get(image_url, stream=True)
348
- resp.raise_for_status()
349
- with open(local_path, "wb") as f:
350
- for chunk in resp.iter_content(chunk_size=8192):
351
- f.write(chunk)
352
- logger.info(f"Downloaded image: {local_path}")
353
- except Exception as e:
354
- logger.error(f"Error downloading image {image_url}: {e}")
355
 
356
  def save_html_content(self, url: str, folder: str):
357
  try:
358
- response = requests.get(url)
359
- response.raise_for_status()
360
- html_path = os.path.join(folder, os.path.basename(folder) + ".html")
361
- with open(html_path, 'w', encoding='utf-8') as file:
362
- file.write(response.text)
 
 
363
  except Exception as e:
364
- logger.error(f"Error saving HTML content for URL {url}: {e}")
365
 
366
- @staticmethod
367
- def save_model_info(model_info: dict, folder: str):
368
- with open(os.path.join(folder, "model_info.json"), "w", encoding="utf-8") as file:
369
- json.dump(model_info, file, indent=2)
 
 
 
 
370
 
371
  # ============================================================================
372
- # model_list.log
373
  # ============================================================================
374
  def read_model_list(self):
375
- model_list = {}
 
 
376
  try:
377
  with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
378
  for line in f:
379
  line = line.strip()
380
  if not line:
381
  continue
 
382
  parts = line.split(": ", 1)
383
  if len(parts) == 2:
384
- stored_id, stored_url = parts
385
- model_list[stored_id] = stored_url
386
  except Exception as e:
387
- logger.error(f"Failed to read model list: {e}")
388
- return model_list
 
 
 
 
 
 
389
 
390
  # ============================================================================
391
- # model 情報取得
392
  # ============================================================================
393
- def get_model_info(self, model_id: str) -> dict:
 
394
  try:
395
- url = self.config.URLS["modelId"] + str(model_id)
396
  resp = requests.get(url, headers=self.config.HEADERS)
397
  resp.raise_for_status()
398
  return resp.json()
399
- except requests.RequestException as e:
400
- logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
401
  return {}
402
 
403
  # ============================================================================
404
- # メイン処理: 最新ファイル + images はフォルダごとアップロード。old_versions は1ファイルずつ。
405
  # ============================================================================
406
  def process_model(self, model_url: str):
407
  try:
408
  model_id = model_url.rstrip("/").split("/")[-1]
409
  model_info = self.get_model_info(model_id)
410
  if not model_info:
411
- logger.error(f"No model_info returned for {model_id}")
 
 
 
 
412
  return
413
 
414
- model_versions = model_info.get("modelVersions", [])
415
- if not model_versions:
416
- logger.error(f"No modelVersions in model info {model_id}")
 
417
  return
418
 
419
- latest_version = model_versions[0]
420
- model_file = next((file for file in latest_version["files"] if file.get('type') == 'Model'), None)
421
- if model_file:
422
- latest_filename = model_file['name']
423
- folder = os.path.splitext(latest_filename)[0]
 
424
  else:
425
- first_file = latest_version["files"][0]
426
- latest_filename = first_file['name']
427
- folder = os.path.splitext(latest_filename)[0]
428
- logger.warning(f"No 'Model' type file found for model ID {model_id}. Using first file's name.")
429
 
 
430
  os.makedirs(folder, exist_ok=True)
431
-
432
- # すでにアップ済みかどうか model_list.log でチェック (モデル名ベース)
433
- model_list = self.read_model_list()
434
- modelpage_name = model_info.get("name", f"Model_{model_id}")
435
- if modelpage_name in model_list.values():
436
- logger.info(f"Model '{modelpage_name}' already in model_list. Skipping.")
437
- # 必要ならreturn
438
-
439
- # 最新バージョン (まとめてダウンロード)
440
- self.download_model(model_versions, folder)
441
-
442
- # 画像 (imagesフォルダまるごとダウンロード)
443
- self.download_images(model_versions, folder)
444
 
445
  # HTML & model_info.json
446
  self.save_html_content(self.config.URLS["modelPage"] + str(model_id), folder)
447
  self.save_model_info(model_info, folder)
448
 
449
- # 古いバージョンのみ「1つずつアップロード&削除」
450
- self.download_old_versions_one_by_one(model_versions, folder)
451
-
452
- # ↑で old_versions は空になった → あとはフォルダに残っているのは
453
- # 最新バージョンファイル・imagesフォルダ・model_info.json・HTML など
454
-
455
- # "folder" 自体を暗号化アップロード (= images フォルダごとアップロード)
456
- # path_in_repo を "" にすればフォルダ名も暗号化される
457
- self.upload_folder_encrypted(folder, path_in_repo="")
458
 
459
- # ローカルフォルダ削除
460
- if os.path.exists(folder):
461
- shutil.rmtree(folder)
 
 
462
 
463
- # model_list.log に追記
464
- # HF上では folder名 も暗号化されるが、ここでは元の "modelpage_name" と
465
- # HFへのトップフォルダ参照URLを書く
466
- model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main"
467
- with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
468
- f.write(f"{modelpage_name}: {model_hf_url}\n")
469
 
470
  except Exception as e:
471
- logger.error(f"Unexpected error processing model ({model_url}): {e}")
472
 
473
  # ============================================================================
474
- # crawl
475
  # ============================================================================
476
  async def crawl(self):
477
  while True:
478
  try:
479
  login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
 
 
 
480
 
481
- # 最新の model_list.log & civitai_backup.log をダウンロード
482
- model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
483
- shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
484
 
485
- local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
486
- shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
487
-
488
- # ログ読み込み
489
- with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
490
- lines = file.read().splitlines()
491
  old_models = json.loads(lines[0]) if len(lines) > 0 else []
492
  self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
493
 
494
- # 新着モデル確認
495
- response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
496
- response.raise_for_status()
497
- latest_models = response.json().get("items", [])
498
- latest_model_ids = [m["id"] for m in latest_models if "id" in m]
499
-
500
- # 差集合
501
- new_models = list(set(latest_model_ids) - set(old_models))
502
-
503
- if new_models:
504
- logger.info(f"New models found: {new_models}")
505
- model_id = new_models[0]
506
 
 
 
 
 
507
  for attempt in range(1, 6):
508
  try:
509
- self.process_model(f"{self.config.URLS['modelId']}{model_id}")
510
  break
511
  except Exception as e:
512
- logger.error(f"Failed to process model ID {model_id} (Attempt {attempt}/5): {e}")
513
  if attempt == 5:
514
- logger.error(f"Skipping model ID {model_id} after 5 failed attempts.")
515
  else:
516
  await asyncio.sleep(2)
517
  else:
518
  # 新モデルなし
519
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
520
- f.write(json.dumps(latest_model_ids) + "\n")
521
- f.write(f"{self.repo_ids['current']}\n")
522
- logger.info(f"Updated log file: {self.config.LOG_FILE}")
523
-
524
  self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
525
- logger.info("Uploaded log file to repository (unencrypted).")
526
-
527
- logger.info("No new models found.")
528
  await asyncio.sleep(60)
529
  continue
530
 
531
- # 1件アップロードに成功したら old_models に追加
532
- old_models.append(model_id)
533
  with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
534
  f.write(json.dumps(old_models) + "\n")
535
- f.write(f"{self.repo_ids['current']}\n")
536
- logger.info(f"Updated log file with new model ID: {model_id}")
537
 
538
- # ログと model_list.log をアップ
539
  self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
540
  self.upload_file_raw(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)
541
 
542
  except Exception as e:
543
- logger.error(f"Error during crawling: {e}")
544
  await asyncio.sleep(300)
545
 
546
 
547
- # 実行
548
  config = Config()
549
  crawler = CivitAICrawler(config)
550
  app = crawler.app
 
14
  from bs4 import BeautifulSoup
15
  from fake_useragent import UserAgent
16
  from fastapi import FastAPI
17
+ from huggingface_hub import HfApi, hf_hub_download, login
18
 
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
22
 
23
  class Config:
24
+ """設定用クラス"""
25
  HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
26
  CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
27
  LOG_FILE = "civitai_backup.log"
 
35
  "latest": "https://civitai.com/api/v1/models?sort=Newest",
36
  "modelPage": "https://civitai.com/models/",
37
  "modelId": "https://civitai.com/api/v1/models/",
 
 
38
  }
39
  JST = datetime.timezone(datetime.timedelta(hours=9))
40
  UA = UserAgent()
41
  HEADERS = {
42
+ "Authorization": f'Bearer {CIVITAI_API_TOKEN}',
43
+ "User-Agent": "civitai-crawler/1.0",
44
  "Content-Type": "application/json"
45
  }
46
 
47
+ # rclone
48
  RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
49
  ENCRYPTED_DIR = "/home/user/app/encrypted"
50
 
51
 
52
  class CivitAICrawler:
53
+ """CivitAIからダウンロード & Hugging Faceにアップロード"""
54
 
55
  def __init__(self, config: Config):
56
  self.config = config
 
59
  self.repo_ids = self.config.REPO_IDS.copy()
60
  self.jst = self.config.JST
61
 
 
62
  self.setup_rclone_conf()
63
  self.setup_routes()
64
 
65
def setup_routes(self):
    """Wire up the FastAPI endpoints for this crawler instance."""
    app = self.app

    @app.get("/")
    def root():
        # Liveness page: current JST time plus the repo being filled.
        now = datetime.datetime.now(self.jst)
        return f"Status: {now} -- current repo: {self.repo_ids['current']}"

    @app.on_event("startup")
    async def on_startup():
        # Launch the crawl loop as a background task once the app is up.
        asyncio.create_task(self.crawl())
74
 
 
 
 
75
def setup_rclone_conf(self):
    """Materialize rclone.conf from the RCLONE_CONF_BASE64 setting.

    Decodes the base64 blob into .rclone_config/rclone.conf and points
    rclone at it via the RCLONE_CONFIG environment variable. No-op
    (with a warning) when the setting is empty.
    """
    encoded = self.config.RCLONE_CONF_BASE64
    if not encoded:
        logger.warning("No RCLONE_CONF_BASE64 found.")
        return

    os.makedirs(".rclone_config", exist_ok=True)
    conf_path = os.path.join(".rclone_config", "rclone.conf")
    with open(conf_path, "wb") as fh:
        fh.write(base64.b64decode(encoded))

    os.environ["RCLONE_CONFIG"] = conf_path
    logger.info(f"rclone.conf created at: {conf_path}")
85
 
86
+ # ============================================================================
87
+ # rcloneで暗号化アップロード
88
+ # ============================================================================
89
def encrypt_with_rclone(self, local_path: str):
    """Copy local_path into the cryptLocal: rclone remote.

    rclone's crypt backend encrypts both names and contents; the result
    lands in self.config.ENCRYPTED_DIR. Raises FileNotFoundError when
    the input is missing or when rclone produced no output directory.
    """
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"Local path not found: {local_path}")

    # Start from a clean staging dir so stale encrypted output never leaks in.
    if os.path.isdir(self.config.ENCRYPTED_DIR):
        shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)

    top_name = os.path.basename(local_path.rstrip("/")) or "unnamed"
    cmd = ["rclone", "copy", local_path, f"cryptLocal:{top_name}", "-v"]
    logger.info("Running: %s", " ".join(cmd))
    subprocess.run(cmd, check=True)

    if not os.path.isdir(self.config.ENCRYPTED_DIR):
        raise FileNotFoundError("Encrypted dir not found after rclone copy.")
 
 
108
 
109
def upload_encrypted_files(self, repo_id: str, path_in_repo: str = ""):
    """Upload every file under ENCRYPTED_DIR to repo_id, keeping the relative layout.

    Each file gets up to 5 upload attempts, with special handling for:
      * HF rate limits reported as "in N minutes" — sleep it out without
        consuming a retry;
      * the "retry this action in about 1 hour" throttle — sleep one hour
        without consuming a retry;
      * the 100000-files-per-repo limit — roll over to a freshly created
        repo and restart the attempt counter.
    Re-raises the last upload error once the retries are exhausted.
    """
    max_retries = 5
    for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
        for fn in files:
            enc_file_path = os.path.join(root, fn)
            rel_path = os.path.relpath(enc_file_path, self.config.ENCRYPTED_DIR)
            upload_path = os.path.join(path_in_repo, rel_path)

            attempt = 0
            while attempt < max_retries:
                try:
                    self.api.upload_file(
                        path_or_fileobj=enc_file_path,
                        repo_id=repo_id,
                        path_in_repo=upload_path
                    )
                    logger.info("[OK] Uploaded => %s/%s", repo_id, upload_path)
                    break
                except Exception as e:
                    attempt += 1
                    msg = str(e)
                    if "rate-limited" in msg and "minutes" in msg:
                        # uses module-level `re` (the old inline `import re` was redundant)
                        m = re.search(r"in (\d+) minutes?", msg)
                        if m:
                            mins = int(m.group(1)) + 1
                            logger.warning("Rate-limited. Wait %d minutes.", mins)
                            time.sleep(mins * 60)
                            attempt -= 1  # waiting out a rate limit is not a failed try
                            continue
                        # BUGFIX: an unparsable rate-limit message used to
                        # decrement `attempt` and continue without sleeping,
                        # spinning forever; now it falls through to the
                        # counted-retry path below.
                    if "you can retry this action in about 1 hour" in msg:
                        logger.warning("Encountered retry in 1 hour error, waiting..")
                        time.sleep(3600)
                        attempt -= 1
                        continue
                    if "over the limit of 100000 files" in msg:
                        logger.warning("File limit exceeded. Creating new repo..")
                        self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                        self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                        attempt = 0
                        repo_id = self.repo_ids['current']
                        continue
                    if attempt < max_retries:
                        logger.warning("Failed to upload, retry %d/%d..", attempt, max_retries)
                    else:
                        logger.error("Failed after %d attempts: %s", max_retries, enc_file_path)
                        raise
157
 
158
def upload_folder_encrypted(self, folder_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
    """Encrypt folder_path with rclone, push the result, then clear the staging dir.

    Defaults to the currently active repo when repo_id is not given.
    """
    target_repo = repo_id or self.repo_ids['current']

    self.encrypt_with_rclone(folder_path)
    self.upload_encrypted_files(target_repo, path_in_repo=path_in_repo)

    # Remove the encrypted staging copy once it has been uploaded.
    if os.path.isdir(self.config.ENCRYPTED_DIR):
        shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
165
 
 
 
 
166
def upload_file_encrypted_one_by_one(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
    """Old-version flow: encrypt-upload a single file, then delete it locally."""
    target_repo = repo_id or self.repo_ids['current']

    self.encrypt_with_rclone(file_path)
    self.upload_encrypted_files(target_repo, path_in_repo)

    # Drop both the rclone staging dir and the plain local file once uploaded.
    if os.path.isdir(self.config.ENCRYPTED_DIR):
        shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
    if os.path.exists(file_path):
        os.remove(file_path)
177
 
178
  @staticmethod
179
  def increment_repo_name(repo_id: str) -> str:
180
+ m = re.search(r'(\d+)$', repo_id)
181
+ if m:
182
+ num = int(m.group(1)) + 1
183
+ return re.sub(r'\d+$', str(num), repo_id)
184
  else:
185
+ return repo_id + "1"
186
 
187
  # ============================================================================
188
+ # 生ファイルアップロード(ログやmodel_listなど)
189
  # ============================================================================
190
  def upload_file_raw(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
191
+ if not repo_id:
192
  repo_id = self.repo_ids['current']
193
+ if not path_in_repo:
194
  path_in_repo = os.path.basename(file_path)
195
 
196
  max_retries = 5
 
202
  repo_id=repo_id,
203
  path_in_repo=path_in_repo
204
  )
205
+ logger.info("[OK] Uploaded raw %s => %s/%s", file_path, repo_id, path_in_repo)
206
  return
207
  except Exception as e:
208
  attempt += 1
209
+ msg = str(e)
210
+ if "over the limit of 100000 files" in msg:
211
+ logger.warning("File limit exceeded, creating new repo..")
212
  self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
213
  self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
214
  attempt = 0
215
  repo_id = self.repo_ids['current']
216
  continue
217
+ if "you can retry this action in about 1 hour" in msg:
218
+ logger.warning("Rate-limited 1h, waiting..")
219
  time.sleep(3600)
220
  attempt -= 1
221
  else:
222
  if attempt < max_retries:
223
+ logger.warning("Failed raw upload attempt %d/%d..", attempt, max_retries)
224
  else:
225
+ logger.error("Failed raw upload after %d attempts: %s", max_retries, file_path)
226
  raise
227
 
228
  # ============================================================================
229
+ # ダウンロード
230
  # ============================================================================
231
@staticmethod
def get_filename_from_cd(cd: Optional[str], default_name: str) -> str:
    """Extract the filename from a Content-Disposition header value.

    Returns default_name when the header is None/empty or carries no
    filename= parameter.
    """
    if cd:
        for part in cd.split(";"):
            if "filename=" in part:
                # BUGFIX: split on the FIRST '=' only, so a value such as
                # filename="a=b.txt" keeps everything after the first '='.
                return part.split("=", 1)[1].strip().strip('"')
    return default_name
239
 
240
def download_file(self, url: str, dest_folder: str, default_name: str):
    """Stream url into dest_folder; return the local path, or None on HTTP failure."""
    try:
        response = requests.get(url, headers=self.config.HEADERS, stream=True)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error("Failed to DL from %s: %s", url, e)
        return None

    # Prefer the server-advertised filename over the caller's default.
    name = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
    os.makedirs(dest_folder, exist_ok=True)
    target = os.path.join(dest_folder, name)

    with open(target, "wb") as out:
        for block in response.iter_content(chunk_size=8192):
            out.write(block)

    logger.info("Downloaded: %s", target)
    return target
258
 
259
  # ============================================================================
260
+ # 旧バージョンファイルだけ1ファイルずつアップロード→削除
261
  # ============================================================================
262
def download_old_versions_one_by_one(self, model_versions: list, folder: str):
    """For every non-latest version: download each file, encrypt-upload it, delete it.

    One file is in flight at a time to keep local disk usage low; the
    temporary old_versions folder is removed at the end.
    """
    if len(model_versions) <= 1:
        return  # only the latest version exists — nothing old to archive

    old_dir = os.path.join(folder, "old_versions")
    os.makedirs(old_dir, exist_ok=True)

    for version in model_versions[1:]:
        for file_info in version.get("files", []):
            fetched = self.download_file(file_info["downloadUrl"], old_dir, file_info["name"])
            if fetched and os.path.exists(fetched):
                self.upload_file_encrypted_one_by_one(fetched)

    if os.path.exists(old_dir):
        shutil.rmtree(old_dir, ignore_errors=True)
 
 
 
 
 
 
 
 
 
 
278
 
279
  # ============================================================================
280
+ # 通常の(最新)モデルファイル & images フォルダを一括DL→アップロード
281
  # ============================================================================
282
def download_latest_files(self, model_versions: list, folder: str):
    """Download every file of the newest version (model_versions[0]) into folder."""
    newest = model_versions[0].get("files", [])
    os.makedirs(folder, exist_ok=True)
    for file_info in newest:
        self.download_file(file_info["downloadUrl"], folder, file_info["name"])
 
 
 
 
 
290
 
291
def download_images(self, model_versions: list, folder: str):
    """Fetch every preview image of every version into <folder>/images."""
    images_folder = os.path.join(folder, "images")
    os.makedirs(images_folder, exist_ok=True)

    for version in model_versions:
        for image in version.get("images", []):
            source_url = image["url"]
            # Keep the original naming scheme: basename plus a ".png" suffix
            # (e.g. 49945407.jpeg -> "49945407.jpeg.png").
            target_name = os.path.basename(source_url) + ".png"
            self.download_file(source_url, images_folder, target_name)
 
 
 
 
 
 
 
 
 
 
 
301
 
302
def save_html_content(self, url: str, folder: str):
    """Best-effort: save the page at url as <folder>/<basename(folder)>.html."""
    try:
        page = requests.get(url)
        page.raise_for_status()
        out_path = os.path.join(folder, os.path.basename(folder) + ".html")
        with open(out_path, "w", encoding="utf-8") as fh:
            fh.write(page.text)
        logger.info("Saved HTML => %s", out_path)
    except Exception as e:
        # Failures are logged, not raised: the HTML snapshot is optional.
        logger.error("Failed to save HTML from %s: %s", url, e)
313
 
314
+ def save_model_info(self, model_info: dict, folder: str):
315
+ info_path = os.path.join(folder, "model_info.json")
316
+ try:
317
+ with open(info_path, "w", encoding="utf-8") as f:
318
+ json.dump(model_info, f, indent=2)
319
+ logger.info("Saved model_info => %s", info_path)
320
+ except Exception as e:
321
+ logger.error("Failed to save model_info: %s", e)
322
 
323
  # ============================================================================
324
+ # model_list.log の読み書き
325
  # ============================================================================
326
  def read_model_list(self):
327
+ ret = {}
328
+ if not os.path.exists(self.config.LIST_FILE):
329
+ return ret
330
  try:
331
  with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
332
  for line in f:
333
  line = line.strip()
334
  if not line:
335
  continue
336
+ # 例: "111123: https://huggingface.co/xxx"
337
  parts = line.split(": ", 1)
338
  if len(parts) == 2:
339
+ key, val = parts
340
+ ret[key] = val
341
  except Exception as e:
342
+ logger.error("Failed to read model_list.log: %s", e)
343
+ return ret
344
+
345
+ def append_model_list(self, key: str, val: str):
346
+ """'key: val' を model_list.log の末尾に追加."""
347
+ with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
348
+ f.write(f"{key}: {val}\n")
349
+ logger.info("Appended to model_list.log => '%s: %s'", key, val)
350
 
351
  # ============================================================================
352
+ # get_model_info
353
  # ============================================================================
354
+ def get_model_info(self, model_id: str):
355
+ url = self.config.URLS["modelId"] + str(model_id)
356
  try:
 
357
  resp = requests.get(url, headers=self.config.HEADERS)
358
  resp.raise_for_status()
359
  return resp.json()
360
+ except Exception as e:
361
+ logger.error("Failed to get model info for %s: %s", model_id, e)
362
  return {}
363
 
364
  # ============================================================================
365
+ # メイン: 1モデルを処理
366
  # ============================================================================
367
  def process_model(self, model_url: str):
368
  try:
369
  model_id = model_url.rstrip("/").split("/")[-1]
370
  model_info = self.get_model_info(model_id)
371
  if not model_info:
372
+ logger.error("No model_info for %s", model_id)
373
+ return
374
+ vers = model_info.get("modelVersions", [])
375
+ if not vers:
376
+ logger.error("No modelVersions for %s", model_id)
377
  return
378
 
379
+ # 既にアップしたかどうか model_list.log で判定(例: model_id で判定)
380
+ model_list = self.read_model_list()
381
+ if model_id in model_list:
382
+ logger.info("Model ID %s is already in model_list. Skipping..", model_id)
383
  return
384
 
385
+ # まずフォルダ名
386
+ latest_files = vers[0].get("files", [])
387
+ if latest_files:
388
+ main_file = next((f for f in latest_files if f.get("type") == "Model"), latest_files[0])
389
+ main_name = main_file["name"]
390
+ folder = os.path.splitext(main_name)[0]
391
  else:
392
+ folder = f"model_{model_id}"
 
 
 
393
 
394
+ # 1) 最新バージョン & images をまとめて download
395
  os.makedirs(folder, exist_ok=True)
396
+ self.download_latest_files(vers, folder)
397
+ self.download_images(vers, folder)
 
 
 
 
 
 
 
 
 
 
 
398
 
399
  # HTML & model_info.json
400
  self.save_html_content(self.config.URLS["modelPage"] + str(model_id), folder)
401
  self.save_model_info(model_info, folder)
402
 
403
+ # 2) old_versions を1ファイルずつアップロード
404
+ self.download_old_versions_one_by_one(vers, folder)
 
 
 
 
 
 
 
405
 
406
+ # 3) 上記が終わると old_versions は消えている。
407
+ # あとはフォルダに最新バージョンのファイル&images etc.が残っているので
408
+ # フォルダ一括で暗号化アップロード
409
+ encrypted_top_name = self.upload_folder_encrypted(folder)
410
+ shutil.rmtree(folder, ignore_errors=True)
411
 
412
+ # model_list.log に書き込み (例: "111123: https://huggingface.co/xxx/tree/main")
413
+ # 実際にフォルダ暗号化でトップレベル名が変わってるはずなので、
414
+ # ざっくり "/tree/main" を書く
415
+ hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{encrypted_top_name}"
416
+ self.append_model_list(model_id, hf_url)
 
417
 
418
  except Exception as e:
419
+ logger.error("Unexpected error in process_model(%s): %s", model_url, e)
420
 
421
  # ============================================================================
422
+ # メインループ
423
  # ============================================================================
424
    async def crawl(self):
        """Main polling loop: restore state from HF, check CivitAI for new
        models, back up at most one new model per iteration, persist state.

        The log file format is two lines: line 1 is a JSON array of already
        processed model IDs, line 2 is the current backup repo id. Any error
        in an iteration is logged and retried after a 5-minute sleep.
        """
        while True:
            try:
                login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
                # Download the persisted model list / log from HF and copy
                # them next to the app so local reads/writes work below.
                ml_path = hf_hub_download(repo_id=self.repo_ids["model_list"], filename=self.config.LIST_FILE)
                shutil.copyfile(ml_path, self.config.LIST_FILE)

                log_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
                shutil.copyfile(log_path, self.config.LOG_FILE)

                with open(self.config.LOG_FILE, "r", encoding="utf-8") as f:
                    lines = f.read().splitlines()
                # Line 1: JSON list of processed IDs; line 2: current repo id.
                # Missing lines fall back to empty defaults.
                old_models = json.loads(lines[0]) if len(lines) > 0 else []
                self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""

                # Check the newest models on CivitAI.
                r = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
                r.raise_for_status()
                items = r.json().get("items", [])
                latest_ids = [it["id"] for it in items if "id" in it]

                new_ids = list(set(latest_ids) - set(old_models))
                if new_ids:
                    logger.info("New models found: %s", new_ids)
                    # Process only one model per loop iteration, with up to
                    # 5 attempts before giving up on it.
                    target_id = new_ids[0]
                    for attempt in range(1, 6):
                        try:
                            self.process_model(f"{self.config.URLS['modelId']}{target_id}")
                            break
                        except Exception as e:
                            logger.error("Failed to process ID %s (attempt %d/5): %s", target_id, attempt, e)
                            if attempt == 5:
                                # After 5 failures the ID is still appended to
                                # old_models below, so it won't be retried.
                                logger.error("Skipping ID %s after 5 attempts", target_id)
                            else:
                                await asyncio.sleep(2)
                else:
                    # No new models: refresh the log with the latest ID list
                    # and current repo, upload it, then poll again in 60s.
                    with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                        f.write(json.dumps(latest_ids) + "\n")
                        f.write(self.repo_ids["current"] + "\n")
                    self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
                    logger.info("No new models found. Sleep 60s..")
                    await asyncio.sleep(60)
                    continue

                # Record the processed ID in old_models and rewrite the
                # two-line civitai_backup.log.
                old_models.append(target_id)
                with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                    f.write(json.dumps(old_models) + "\n")
                    f.write(self.repo_ids["current"] + "\n")
                logger.info("Updated log with new ID: %s", target_id)

                # Upload both state files back to their HF repos.
                self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
                self.upload_file_raw(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)

            except Exception as e:
                logger.error("Error in crawl: %s", e)
                await asyncio.sleep(300)
484
 
485
 
 
486
# Module-level entry point: build the crawler once and expose its FastAPI
# instance as `app` (served e.g. via `uvicorn main:app`).
config = Config()
crawler = CivitAICrawler(config)
app = crawler.app