ttttdiva committed
Commit 4de47e3 · verified · 1 Parent(s): 65648d4

Upload main.py

Files changed (1):
  main.py (+95 -123)
main.py CHANGED
@@ -16,13 +16,10 @@ from fake_useragent import UserAgent
 from fastapi import FastAPI
 from huggingface_hub import HfApi, hf_hub_download, login
 
-# logging configuration
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-
 class Config:
-    """Configuration class."""
     HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
     CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
     LOG_FILE = "civitai_backup.log"
@@ -47,10 +44,7 @@ class Config:
         "Content-Type": "application/json"
     }
 
-
 class CivitAICrawler:
-    """Class that downloads models from CivitAI and uploads them to Hugging Face."""
-
     def __init__(self, config: Config):
         import base64
 
@@ -76,14 +70,13 @@ class CivitAICrawler:
         self.setup_routes()
 
     def setup_routes(self):
-        """Configure the FastAPI routes."""
         @self.app.get("/")
         def read_root():
             now = str(datetime.datetime.now(self.jst))
             description = f"""
-            This space periodically crawls CivitAI and backs up new models to {self.repo_ids['current']}.
-            Please see https://huggingface.co/{self.repo_ids['model_list']}/blob/main/model_list.log for the model list.
-            Status: {now} + currently running :D
+            This Space periodically crawls CivitAI and backs up new models to {self.repo_ids['current']}.
+            Model list: https://huggingface.co/{self.repo_ids['model_list']}/blob/main/model_list.log
+            Status: {now} + currently running.
             """
             return description
 
@@ -93,7 +86,6 @@ class CivitAICrawler:
 
     @staticmethod
     def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
-        """Get the file name from the Content-Disposition header."""
         if content_disposition:
             parts = content_disposition.split(';')
             for part in parts:
@@ -102,133 +94,120 @@ class CivitAICrawler:
         return default_name
 
     def download_file(self, url: str, destination_folder: str, default_name: str) -> Optional[str]:
-        """Download a file from the given URL and save it to the given folder."""
+        """Download a file from url and save it under destination_folder."""
+        os.makedirs(destination_folder, exist_ok=True)  # create the folder just in case
         try:
-            response = requests.get(url, headers=self.config.HEADERS, stream=True)
-            response.raise_for_status()
+            resp = requests.get(url, headers=self.config.HEADERS, stream=True)
+            resp.raise_for_status()
         except requests.RequestException as e:
             logger.error(f"Failed to download file from {url}: {e}")
             return None
 
-        filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
+        filename = self.get_filename_from_cd(resp.headers.get('content-disposition'), default_name)
         file_path = os.path.join(destination_folder, filename)
-
-        with open(file_path, 'wb') as file:
-            for chunk in response.iter_content(chunk_size=8192):
-                file.write(chunk)
+        with open(file_path, 'wb') as f:
+            for chunk in resp.iter_content(chunk_size=8192):
+                f.write(chunk)
         logger.info(f"Downloaded: {file_path}")
         return file_path
 
     def get_model_info(self, model_id: str) -> dict:
-        """Fetch the model metadata."""
         try:
-            response = requests.get(self.config.URLS["modelId"] + str(model_id), headers=self.config.HEADERS)
-            response.raise_for_status()
-            return response.json()
+            resp = requests.get(self.config.URLS["modelId"] + str(model_id), headers=self.config.HEADERS)
+            resp.raise_for_status()
+            return resp.json()
         except requests.RequestException as e:
             logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
             return {}
 
     def download_images(self, model_versions: list, folder: str):
-        """Download all images into the images folder."""
+        """Download model images into folder/images."""
         images_folder = os.path.join(folder, "images")
         os.makedirs(images_folder, exist_ok=True)
-
         images = []
-        for version in model_versions:
-            for img in version.get("images", []):
+        for ver in model_versions:
+            for img in ver.get("images", []):
                 images.append(img["url"])
 
         for image_url in images:
             image_name = os.path.basename(image_url)
-            local_path = os.path.join(images_folder, image_name)
-            try:
-                resp = requests.get(image_url, stream=True)
-                resp.raise_for_status()
-                with open(local_path, 'wb') as imgf:
-                    for chunk in resp.iter_content(chunk_size=8192):
-                        imgf.write(chunk)
-                logger.info(f"Downloaded image: {local_path}")
-            except requests.RequestException as e:
-                logger.error(f"Failed to download image {image_url}: {e}")
+            self.download_file(image_url, images_folder, image_name)
 
     def save_html_content(self, model_page_url: str, folder: str):
-        """Save the model page HTML inside the folder."""
+        """Save the HTML of model_page_url as {folder}/{folder name}.html."""
        try:
             resp = requests.get(model_page_url)
             resp.raise_for_status()
-            html_path = os.path.join(folder, f"{os.path.basename(folder)}.html")
+            html_name = os.path.basename(folder) + ".html"
+            html_path = os.path.join(folder, html_name)
             with open(html_path, 'w', encoding='utf-8') as f:
                 f.write(resp.text)
             logger.info(f"Saved HTML: {html_path}")
         except Exception as e:
-            logger.error(f"Error saving HTML content from {model_page_url}: {e}")
+            logger.error(f"Error saving HTML from {model_page_url}: {e}")
 
     def save_model_info_json(self, model_info: dict, folder: str):
-        """Save the model info (JSON)."""
+        """Save model_info.json into folder."""
         info_path = os.path.join(folder, "model_info.json")
         try:
             with open(info_path, 'w', encoding='utf-8') as f:
                 json.dump(model_info, f, indent=2)
             logger.info(f"Saved model_info.json: {info_path}")
         except Exception as e:
-            logger.error(f"Failed to save model info JSON: {e}")
+            logger.error(f"Failed to save model_info.json: {e}")
 
     def download_and_process_versions(self, model_versions: list, folder: str):
         """
-        Download the latest version plus all older versions into one folder,
-        then encrypt and upload everything at the end.
+        The latest version goes into folder/,
+        older versions are collected under folder/old_versions/.
         """
+        if not model_versions:
+            return
 
-        # 1) the latest version goes into folder
-        latest_version = model_versions[0]
-        for file_info in latest_version.get("files", []):
+        # latest version => folder
+        latest_ver = model_versions[0]
+        for file_info in latest_ver.get("files", []):
             download_url = file_info["downloadUrl"]
             file_name = file_info["name"]
-            local_path = self.download_file(download_url, folder, file_name)
-            # retry handling etc. omitted
+            self.download_file(download_url, folder, file_name)
 
-        # 2) collect older versions under "folder/old_versions"
         if len(model_versions) > 1:
-            old_versions_folder = os.path.join(folder, "old_versions")
-            os.makedirs(old_versions_folder, exist_ok=True)
-
-            for version in model_versions[1:]:
-                for file_info in version.get("files", []):
-                    file_name = file_info["name"]
-                    download_url = file_info["downloadUrl"]
-                    local_path = self.download_file(download_url, old_versions_folder, file_name)
+            # older versions => folder/old_versions
+            oldv_folder = os.path.join(folder, "old_versions")
+            os.makedirs(oldv_folder, exist_ok=True)
+            for v in model_versions[1:]:
+                for f_info in v.get("files", []):
+                    dl_url = f_info["downloadUrl"]
+                    f_name = f_info["name"]
+                    self.download_file(dl_url, oldv_folder, f_name)
 
     def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
         """
-        1. rclone copy local_folder => cryptLocal: (rclone generates the encrypted folder name on its own)
-        2. Detect the encrypted folder created under "./encrypted" via diffing
-        3. Upload it to Hugging Face with that folder name as path_in_repo
-        4. Delete the local (plaintext + encrypted) folders
-        5. Return value is "the encrypted folder name that was actually created"
+        rclone copy local_folder => cryptLocal:
+        => detect "encrypted/xxxxxx" via diffing -> upload_folder -> delete
+        => returns the encrypted folder name
         """
         if not os.path.isdir(local_folder):
-            logger.error(f"[encrypt_and_upload_folder] Not a directory: {local_folder}")
+            logger.error(f"encrypt_and_upload_folder: {local_folder} is not a directory.")
             return None
 
         encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
         os.makedirs(encrypted_base_dir, exist_ok=True)
 
-        # remove any existing encrypted folders
         before_set = set(os.listdir(encrypted_base_dir))
-        for item in before_set:
-            item_path = os.path.join(encrypted_base_dir, item)
+        # cleanup old stuff
+        for itm in before_set:
+            itm_path = os.path.join(encrypted_base_dir, itm)
             try:
-                if os.path.isfile(item_path):
-                    os.remove(item_path)
+                if os.path.isfile(itm_path):
+                    os.remove(itm_path)
                 else:
-                    shutil.rmtree(item_path)
-                logger.info(f"[CLEANUP] Removed old encrypted item: {item_path}")
+                    shutil.rmtree(itm_path)
+                logger.info(f"[CLEANUP] Removed {itm_path}")
             except Exception as e:
-                logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")
+                logger.warning(f"[CLEANUP] Failed to remove {itm_path}: {e}")
 
-        # === (1) rclone copy local_folder => cryptLocal: ===
-        # this creates /home/user/app/encrypted/<random encrypted folder>
+        # rclone copy local_folder => cryptLocal:
         try:
             subprocess.run(
                 ["rclone", "copy", local_folder, "cryptLocal:", "--create-empty-src-dirs"],
@@ -239,39 +218,35 @@
             logger.error(f"rclone copy failed: {e}")
             return None
 
-        # === (2) diff detection: get the newly created encrypted folder name ===
         after_set = set(os.listdir(encrypted_base_dir))
         diff = after_set - before_set
         if not diff:
-            logger.error("[ERROR] No new directory appeared in ./encrypted after rclone copy.")
+            logger.error("[ERROR] No new directory in ./encrypted after rclone copy.")
             return None
         if len(diff) > 1:
-            logger.warning(f"[WARN] Multiple new directories created: {diff}")
+            logger.warning(f"[WARN] multiple new dirs? {diff}")
 
-        enc_folder_name = diff.pop()  # take the single entry
+        enc_folder_name = diff.pop()
         enc_folder_path = os.path.join(encrypted_base_dir, enc_folder_name)
-
         if not os.path.isdir(enc_folder_path):
             logger.error(f"[ERROR] {enc_folder_path} is not a directory.")
             return None
 
-        # === (3) upload to Hugging Face ===
-        # use "enc_folder_name" as-is for path_in_repo
+        # upload_folder
         try:
             self.upload_folder(enc_folder_path, path_in_repo=enc_folder_name)
-            logger.info(f"[OK] Uploaded encrypted folder: {enc_folder_path}")
+            logger.info(f"[OK] Uploaded {enc_folder_path}")
         except Exception as e:
-            logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
+            logger.error(f"Failed to upload {enc_folder_path}: {e}")
 
-        # === (4) remove local folders (plaintext + encrypted)
+        # cleanup local
         try:
             shutil.rmtree(local_folder)
             shutil.rmtree(enc_folder_path)
-            logger.info(f"[CLEANUP] Removed local folder: {local_folder} & {enc_folder_path}")
+            logger.info(f"[CLEANUP] Removed {local_folder} & {enc_folder_path}")
         except Exception as e:
             logger.warning(f"[CLEANUP] Could not remove local folders: {e}")
 
-        # === (5) return the actual encrypted folder name
         return enc_folder_name
 
     def upload_file(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
@@ -279,7 +254,7 @@
             repo_id = self.repo_ids['current']
         if path_in_repo is None:
             path_in_repo = os.path.basename(file_path)
-
+
         max_retries = 5
         attempt = 0
         while attempt < max_retries:
@@ -301,7 +276,7 @@
                 attempt = 0
                 continue
             elif "you can retry this action in about 1 hour" in error_message:
-                logger.warning("Rate limit hit. Waiting 1 hour...")
+                logger.warning("Rate limit. Wait 1hr.")
                 time.sleep(3600)
                 attempt -= 1
             else:
324
  repo_id=self.repo_ids['current'],
325
  path_in_repo=path_in_repo
326
  )
327
- logger.info(f"Uploaded folder: {folder_path} to {self.repo_ids['current']} at {path_in_repo}")
328
  return
329
  except Exception as e:
330
  attempt += 1
331
  error_message = str(e)
332
  if "over the limit of 100000 files" in error_message:
333
- logger.warning("File limit exceeded, creating a new repo.")
334
  self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
335
  self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
336
  attempt = 0
337
  continue
338
  elif "you can retry this action in about 1 hour" in error_message:
339
- logger.warning("Rate limit hit. Waiting 1 hour...")
340
  time.sleep(3600)
341
  attempt -= 1
342
  else:
343
  if attempt < max_retries:
344
- logger.warning(f"Failed to upload folder {folder_path}, retry {attempt}/{max_retries}")
345
  else:
346
  logger.error(f"Failed after {max_retries} attempts: {e}")
347
  raise
@@ -381,7 +356,6 @@
         return []
 
     def process_model(self, model_url: str):
-        """Download one model into a folder, then encrypt and upload it."""
         try:
             model_id = model_url.rstrip("/").split("/")[-1]
             model_info = self.get_model_info(model_id)
@@ -399,76 +373,76 @@
             folder_name += "_" + str(uuid.uuid4())[:8]
             os.makedirs(folder_name, exist_ok=True)
 
-            # latest + all older versions go under folder_name
+            # download (latest + old)
             self.download_and_process_versions(versions, folder_name)
+            # images
             self.download_images(versions, folder_name)
-            self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name)
+            # HTML
+            model_page_url = f"{self.config.URLS['modelPage']}{model_id}"
+            self.save_html_content(model_page_url, folder_name)
+            # model_info.json
             self.save_model_info_json(model_info, folder_name)
 
-            # encrypt the folder as a whole
-            enc_subfolder = self.encrypt_and_upload_folder(folder_name)
-            if enc_subfolder is None:
-                enc_subfolder = "[ENCRYPT_FAILED]"
+            # finally upload the whole folder
+            enc_folder = self.encrypt_and_upload_folder(folder_name)
+            if enc_folder is None:
+                enc_folder = "[ENCRYPT_FAILED]"
 
-            hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
+            hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_folder}"
             with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
-                f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n")
+                f.write(f"{model_info.get('name','Unknown')} (ID:{model_id}): {hf_enc_url}\n")
 
         except Exception as e:
-            logger.error(f"Error in process_model ({model_url}): {e}")
+            logger.error(f"Error in process_model({model_url}): {e}")
 
     async def crawl(self):
         while True:
             try:
                 login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
 
-                # fetch model_list.log
                 model_list_path = hf_hub_download(
                     repo_id=self.repo_ids['model_list'],
                     filename=self.config.LIST_FILE
                 )
                 shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
 
-                # fetch the log file
                 local_file_path = hf_hub_download(
                     repo_id=self.repo_ids["log"],
                     filename=self.config.LOG_FILE
                 )
                 shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
 
-                # read the log
                 with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
                     lines = file.read().splitlines()
                     old_models = json.loads(lines[0]) if len(lines) > 0 else []
                     self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
 
-                # check for new models
                 r = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
                 r.raise_for_status()
                 latest_models = r.json().get("items", [])
-                latest_model_ids = [m["id"] for m in latest_models if "id" in m]
+                latest_ids = [m["id"] for m in latest_models if "id" in m]
 
-                new_models = list(set(latest_model_ids) - set(old_models))
-                if new_models:
-                    logger.info(f"New model IDs found: {new_models}")
-                    model_id = new_models[0]
+                new_ids = list(set(latest_ids) - set(old_models))
+                if new_ids:
+                    logger.info(f"New model IDs found: {new_ids}")
+                    mid = new_ids[0]
 
-                    for attempt in range(1, 6):
+                    for attempt in range(1,6):
                         try:
-                            self.process_model(self.config.URLS["modelId"] + str(model_id))
+                            self.process_model(f"{self.config.URLS['modelId']}{mid}")
                             break
                         except Exception as e:
-                            logger.error(f"Failed to process model {model_id} (attempt {attempt}/5): {e}")
+                            logger.error(f"Failed model {mid} (attempt {attempt}/5): {e}")
                             if attempt == 5:
-                                logger.error(f"Skipping model {model_id} after 5 failures.")
+                                logger.error(f"Skipping model {mid}")
                             else:
                                 await asyncio.sleep(2)
 
-                    old_models.append(model_id)
+                    old_models.append(mid)
                     with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
-                        f.write(json.dumps(old_models) + "\n")
+                        f.write(json.dumps(old_models)+"\n")
                         f.write(f"{self.repo_ids['current']}\n")
-                    logger.info(f"Updated log with new model ID: {model_id}")
+                    logger.info(f"Updated log with new model ID: {mid}")
 
                     self.upload_file(
                         file_path=self.config.LOG_FILE,
@@ -482,9 +456,9 @@
                     )
                 else:
                     with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
-                        f.write(json.dumps(latest_model_ids) + "\n")
+                        f.write(json.dumps(latest_ids)+"\n")
                         f.write(f"{self.repo_ids['current']}\n")
-                    logger.info(f"No new models. Updated log: {self.config.LOG_FILE}")
+                    logger.info("No new models found. Updated log.")
                     self.upload_file(
                         file_path=self.config.LOG_FILE,
                         repo_id=self.repo_ids["log"],
@@ -493,13 +467,11 @@
                 logger.info("Uploaded log file.")
                 await asyncio.sleep(60)
                 continue
-
             except Exception as e:
                 logger.error(f"Error in crawl loop: {e}")
                 await asyncio.sleep(300)
 
-
-# FastAPI application
+# FastAPI
 config = Config()
 crawler = CivitAICrawler(config)
 app = crawler.app
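
Note on the `cryptLocal:` remote used by encrypt_and_upload_folder: the rclone crypt remote is configured outside this file, so the commit does not show it. A minimal rclone.conf sketch consistent with the behavior the code relies on (encrypted file and directory names materializing under /home/user/app/encrypted/) might look like the following; the concrete values are assumptions, not part of this commit:

    [cryptLocal]
    type = crypt
    remote = /home/user/app/encrypted
    filename_encryption = standard
    directory_name_encryption = true
    password = <value produced by `rclone obscure`, never plaintext>

With a remote like this, `rclone copy <folder> cryptLocal: --create-empty-src-dirs` writes an encrypted copy whose top-level directory name is itself encrypted, which is why the code discovers it by diffing os.listdir("./encrypted") before and after the copy.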
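The module only exposes `app = crawler.app`; nothing in this diff shows where crawl() is scheduled. One plausible wiring, shown purely as a hypothetical sketch (the hook and task below are illustrative, not from the commit), is a FastAPI startup event that schedules the loop on the server's event loop:

    # hypothetical launcher -- not part of this commit
    @app.on_event("startup")
    async def start_crawler():
        # run the infinite crawl loop in the background of the server's event loop
        asyncio.create_task(crawler.crawl())

The server itself would then be started in the usual way, e.g. `uvicorn main:app --host 0.0.0.0 --port 7860` (7860 being the port Hugging Face Spaces conventionally serve on).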
 